index.html

<!DOCTYPE html>
<html lang="en">
<head>
    <title>Data Mesh Architecture</title>
    <meta charset="utf-8">
    <meta name="description" content="A data mesh architecture is a decentralized approach that enables domain teams to perform cross-domain data analysis on their own." />
    <meta name="keywords" content="data mesh, data mesh architecture, domain-driven data analytics, data analytics, domain-driven design, domain ownership, data as a product, data product, federated governance, self-serve data platform, data platform">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="twitter:card" content="summary_large_image" />
    <meta name="twitter:site" content="@innoq" />
    <meta name="twitter:title" content="Data Mesh Architecture" />
    <meta name="twitter:description" content="Data Mesh From an Engineering Perspective" />
    <meta name="twitter:image" content="https://www.datamesh-architecture.com/images/datamesharchitecture_card.png" />
    <meta name="twitter:image:alt" content="Data Mesh Architecture: Domains are in the center and teams do analytics on their own. They build and interconnect with data products. A data platform team and a enablement team help. Global policies are agreed through federated governance." />
    <meta property="og:url" content="https://datamesh-architecture.com" />
    <meta property="og:title" content="Data Mesh Architecture" />
    <meta property="og:description" content="Data Mesh From an Engineering Perspective" />
    <meta property="og:image" content="https://www.datamesh-architecture.com/images/datamesharchitecture_card.png" />

    <link rel="preload" as="font" type="font/woff2" href="https://www.innoq.com/assets/MarkPro-Book.woff2?cachebuster=2" crossorigin="">
    <link rel="preload" as="font" type="font/woff2" href="https://www.innoq.com/assets/MarkPro-Bold.woff2?cachebuster=2" crossorigin="">
    <link rel="preload" as="font" type="font/woff2" href="https://www.innoq.com/assets/MarkPro-Heavy.woff2?cachebuster=2" crossorigin="">
    <link rel="stylesheet" href="css/style.css" />
    <link rel="stylesheet" href="css/0.9.3_css_bulma.css" />
    <link rel="stylesheet" href="css/font-awesome_6.0.0_css_all.css"/>
    <link rel="shortcut icon" type="image/x-icon" href="/favicon.ico">
</head>
<body>
<section class="hero is-fullheight" style="background-color: #242424">
    <!--<div class="has-text-centered py-3" id="banner-data-product-canvas" style="background-color: #FF9B66;">
        <strong>🔥 NEW: We published a <a href="/data-product-canvas.html" style="color: inherit; text-decoration: underline">Data Product Canvas</a> to design data products.</strong>
    </div>-->
    <div class="has-text-centered py-3" id="banner-data-product-canvas" style="font-family: 'Mark Pro',sans-serif;  font-weight: 800; background-color: #FF9B66;">
        <div>🚀 NEW: We launched <a href="https://www.datamesh-manager.com/?ref=dma-banner" style="color: inherit; text-decoration: underline">Data Mesh Manager</a> to build a data product inventory and manage data contracts. 🚀</div>
    </div>
<!--    <div class="has-text-centered py-3" id="banner-training" style="background-color: #FF9B66; display: none;">-->
<!--        <strong>🇩🇪 Wir bieten ein <a href="https://www.socreatory.com/de/trainings/datamesh?ref=dma" style="color: inherit; text-decoration: underline">2-Tages-Training</a> zu Data Mesh an</strong>-->
<!--        <br/>-->
<!--        <strong>🇩🇪 NEU: <a href="https://oreilly.de/produkt/data-mesh/" style="color: inherit; text-decoration: underline">Wir haben das Data Mesh Buch von Zhamak Dehghani auf Deutsch übersetzt.</a></strong>-->
<!--    </div>-->
<!--    <script type="text/javascript">-->
<!--      const languages = navigator.languages.join().toLowerCase();-->
<!--      if(languages.indexOf("de") > -1) {-->
<!--        document.getElementById("banner-training").style.display = "block";-->
<!--        document.getElementById("banner-data-product-canvas").style.display = "none";-->
<!--      } else {-->
<!--        document.getElementById("banner-training").style.display = "none";-->
<!--        document.getElementById("banner-data-product-canvas").style.display = "block";-->
<!--      }-->
<!--    </script>-->

    <div class="hero-head">
        <nav class="navbar is-dark" role="navigation" aria-label="dropdown navigation">
            <div class="container">
                <div class="navbar-brand">
                    <span class="navbar-burger" data-target="navbarMenuHeroA">
            <span></span>
            <span></span>
            <span></span>
          </span>
                </div>
                <div id="navbarMenuHeroA" class="navbar-menu">

                    <div class="navbar-end">


                        <a href="/#why" class="navbar-item">
                            Why
                        </a>
                        <a href="/#what-is-data-mesh" class="navbar-item">
                            What
                        </a>
                        <div class="navbar-item has-dropdown is-hoverable">
                            <a href="/#how-to-design-a-data-mesh" class="navbar-link is-arrowless">
                                How
                            </a>
                            <div class="navbar-dropdown" id="navbarMenuArchitectureDropdown">
                                <a href="/#how-to-design-a-data-mesh" class="navbar-item">Data Mesh Architecture</a>
                                <hr class="navbar-divider">
                                <a href="/#data-product" class="navbar-item">Data Product</a>
                                <a href="/#federated-governance" class="navbar-item">Federated Governance</a>
                                <a href="/#analytical-data" class="navbar-item">Analytical Data</a>
                                <a href="/#ingesting" class="navbar-item">Ingesting</a>
                                <a href="/#clean-data" class="navbar-item">Clean Data</a>
                                <a href="/#analytics" class="navbar-item">Analytics</a>
                                <a href="/#data-platform" class="navbar-item">Data Platform</a>
                                <a href="/#enabling-team" class="navbar-item">Enabling Team</a>
                            </div>
                        </div>
                        <div class="navbar-item has-dropdown is-hoverable">
                            <div class="navbar-link is-arrowless">
                                Specifications
                            </div>
                            <div class="navbar-dropdown" id="navbarMenuSpecificationsDropdown">
                                <a href="https://www.dataproduct-specification.com" class="navbar-item">Data Product Specification</a>
                                <a href="https://www.datacontract.com" class="navbar-item">Data Contract Specification</a>
                            </div>
                        </div>
                        <div class="navbar-item has-dropdown is-hoverable">
                            <div class="navbar-link is-arrowless">
                                Open Source
                            </div>
                            <div class="navbar-dropdown" id="navbarMenuArchitectureToolsDropdown">
                                <a href="/data-product-canvas.html" class="navbar-item">Data Product Canvas</a>
                                <a href="/fitness-test.html" class="navbar-item">Fitness Test</a><a href="https://cli.datacontract.com" class="navbar-item">Data Contract CLI</a>
                                <a href="/open-source/aws.html" class="navbar-item">AWS Terraform Modules</a>
                                <a href="/open-source/gcp.html" class="navbar-item">GCP Terraform Modules</a>
                            </div>
                        </div>
                        <div class="navbar-item has-dropdown is-hoverable">
                            <a href="/#tech-stacks" class="navbar-link is-arrowless">
                                Tech Stacks
                            </a>
                            <div class="navbar-dropdown" id="navbarMenuTechStackDropdown">
                                <a href="/tech-stacks/google-cloud-bigquery.html" class="navbar-item">
                                    Google Cloud BigQuery
                                </a>
                                <a href="/tech-stacks/aws-s3-athena.html" class="navbar-item">
                                    AWS S3 and AWS Athena
                                </a>
                                <a href="/tech-stacks/azure-synapse-analytics.html" class="navbar-item">
                                    Azure Synapse Analytics
                                </a>
                                <a href="/tech-stacks/dbt-snowflake.html" class="navbar-item">
                                    dbt and Snowflake
                                </a>
                                <a href="/tech-stacks/databricks.html" class="navbar-item">
                                    Databricks
                                </a>
                                <a href="/tech-stacks/minio-trino.html" class="navbar-item">
                                    MinIO and Trino
                                </a>
                                <a href="/tech-stacks/sap.html" class="navbar-item">
                                    SAP
                                </a>
                            </div>
                        </div>
                        <div class="navbar-item has-dropdown is-hoverable">
                            <a href="#domain-teams-journey" class="navbar-link is-arrowless">
                                Start the Journey
                            </a>
                            <div class="navbar-dropdown is-right" id="navbarMenuTransformationDropdown">
                                <a href="#domain-teams-journey" class="navbar-item">
                                    Domain Team’s Journey
                                </a>
                                <a href="#data-teams-journey" class="navbar-item">
                                    Data Team’s Journey
                                </a>
                                <a href="/literature.html" class="navbar-item">
                                    Scientific Literature
                                </a>
                                <a href="/real-world-learnings.html" class="navbar-item">
                                    Real World Learnings
                                </a>
                            </div>
                        </div>
                        <a href="https://www.datamesh-manager.com" class="navbar-item" style="color: #FF9B66">
                            Data Mesh Manager
                        </a>

                    </div>
                </div>
            </div>
        </nav>

    </div>
    <div class="hero-body">
        <div class="">
            <p class="title" style="color: white; font-family: 'Mark Pro',sans-serif; font-weight: 800; font-size: 10vw; letter-spacing: -1px; text-transform: uppercase; line-height: 1.0; margin-bottom: 2rem;">
                Data Mesh <br><span style="color: #FF9B66;">Architecture</span>
            </p>
            <p class="subtitle" style="color: white; font-family: 'Freight Text','Georgia,Times','Times New Roman',serif; font-style: italic; font-size: 4vw; font-weight: 500;">Data Mesh From an Engineering Perspective</p>
        </div>
    </div>
    <div class="hero-foot">
        <div class="columns is-mobile is-centered" style="font-size: 3em;">
            <div class="column is-half" style="text-align: center;">
                <a href="#why" style="color: #FF9B66;">
                <span class="icon is-medium">
                    <i class="fa-solid fa-angle-down"></i>
                </span>
                </a>
            </div>
        </div>
    </div>
</section>
<div class="container">

    <section class="section is-large">
        <h2 class="title" id="why" >Why You May Need a Data Mesh</h2>
        <div class="columns is-vcentered reverse-columns">
            <div class="column content mb-0 pb-1">
                <p>
                    Many organizations have invested in a central data lake and a data team with the
                    expectation to drive their business based on data. However, after a few initial quick wins,
                    they notice that <strong>the central data team often becomes a bottleneck</strong>.
                    The team cannot handle all the analytical questions of management and product owners quickly enough.
                    This is a massive problem because making timely data-driven decisions is crucial to stay competitive.
                    For example:
                    Is it a good idea to offer free shipping during Black Week?
                    Do customers accept longer but more reliable shipping times?
                    How does a product page change influence the checkout and returns rate?
                </p>
                <p>
                    The data team wants to answer all those questions quickly.
                    In practice, however, they struggle because they need to spend too much time fixing broken data pipelines after operational database changes.
                    In their little time remaining, <strong>the data team has to discover and understand the necessary domain data</strong>.
                    For every question, they need to learn domain knowledge to give meaningful insights.
                    Getting the required domain expertise is a daunting task.
                </p>
            </div>
            <div class="column">
                <figure class="image">
                    <img src="images/whyteam.png.webp" alt="The central data team in the middle surrounded by all domain teams, CEO, CFO, and marketing who all have an information need the central data team must fulfill, and for that, the central data team needs to import and understand the data of all domain teams.">
                </figure>
            </div>
        </div>
        <div class="columns is-vcentered">
            <div class="column">
                <figure class="image">
                    <img src="images/whychecklist.png.webp" alt="You already scaled up your software development by: 1. Decentralize business into domains; 2. Decentralize engineering into autonomous teams; 3. Decentralize monolith into microservices; 4. Decentralize operations into DevOps teams. Next step: scale up data analytics by decentralizing data lake into data mesh">
                </figure>
            </div>

            <div class="column content">
                <p>
                    On the other hand, organizations have also invested in domain-driven design, autonomous domain teams (also known as stream-aligned teams or product teams) and a decentralized microservice architecture.
                    These <strong>domain teams own and know their domain</strong>, including the information needs of the business.
                    They design, build, and run their web applications and APIs on their own.
                    Despite knowing the domain and the relevant information needs, the domain teams have to reach out to the overloaded central data team to get the necessary data-driven insights.
                </p>
                <p>
                    With the eventual growth of the organization, the situation of the domain teams and the central data team becomes worse.
                    A way out of this is to shift the responsibility for data from the central data team to the domain teams.
                    This is the core idea behind the data mesh concept:
                    <strong>Domain-oriented decentralization for analytical data</strong>.
                    A data mesh architecture enables domain teams to perform cross-domain data analysis on their own and interconnects data, similar to APIs in a microservice architecture.
                </p>

            </div>
        </div>
    </section>

    <section class="section is-medium">
        <div class="columns is-vcentered">
            <div class="column is-half is-offset-half">
            <h2 class="title" id="what-is-data-mesh">What Is Data Mesh?</h2>
            </div>
        </div>
        <div class="columns is-vcentered">

            <div class="column">
                <a href="images/datamesh.png.webp" class="glightbox">
                    <img src="images/datamesh.png.webp" alt="What Is Data Mesh? Includes the four principles Domain Ownership, Data as a Product, Self-serve Data Platform, and Federated Governance.">
                </a>
            </div>

            <div class="column">
                <div class="content">
                    <p>
                        The term <em>data mesh</em> was coined by <a href="https://martinfowler.com/articles/data-mesh-principles.html">Zhamak Dehghani&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> in 2019 and is based on four fundamental principles that bundle well-known concepts:
                    </p>
                    <p>
                        The <strong>domain ownership</strong> principle mandates the domain teams to take responsibility for their data.
                        According to this principle, analytical data should be composed around domains, similar to the team boundaries aligning with the system’s bounded context.
                        Following the domain-driven distributed architecture, analytical and operational data ownership is moved to the domain teams, away from the central data team.
                    </p>
                    <p>
                        The <strong>data as a product</strong> principle projects a product thinking philosophy onto analytical data.
                        This principle means that there are consumers for the data beyond the domain.
                        The domain team is responsible for satisfying the needs of other domains by providing high-quality data.
                        Basically, domain data should be treated as any other public API.
                    </p>
                    <p>
                        The idea behind the <strong>self-serve data infrastructure platform</strong> is to adopt platform thinking to data infrastructure.
                        A dedicated data platform team provides domain-agnostic functionality, tools, and systems to build, execute, and maintain interoperable data products for all domains.
                        With its platform, the data platform team enables domain teams to seamlessly consume and create data products.
                    </p>
                    <p>
                        The <strong>federated governance</strong> principle achieves interoperability of all data products through standardization, which is promoted through the whole data mesh by the governance group.
                        The main goal of federated governance is to create a data ecosystem with adherence to the organizational rules and industry regulations.
                    </p>
                </div>
            </div>
        </div>
    </section>

    <section class="section is-medium">
        <h2 class="title" id="how-to-design-a-data-mesh">How To Design a Data Mesh?</h2>
        <div class="content">
            <p class="mt-5">
                A data mesh architecture is a decentralized approach that enables domain teams to perform cross-domain data analysis on their own.
                At its core is the domain with its responsible team and its operational and analytical data.
                The domain team ingests operational data and builds analytical data models as data products to perform their own analysis.
                It may also choose to publish data products with data contracts to serve other domains’ data needs.
            </p>
            <img src="images/datamesharchitecture.png.webp" alt="Data Mesh Architecture" style="width: 100%">
            <p>
                The domain team agrees with others on global policies, such as interoperability, security, and documentation standards in a federated governance group, so that domain teams know how to discover, understand and use data products available in the data mesh.
                The self-serve domain-agnostic data platform, provided by the data platform team, enables domain teams to easily build their own data products and do their own analysis effectively.
                An enabling team guides domain teams on how to model analytical data, use the data platform, and build and maintain interoperable data products.
            </p>
            <p>
                Let’s zoom in to the core components of a data mesh architecture and their relationships:
            </p>
        </div>


        <div class="card mt-5">
            <div class="card-content">
                <h4 class="title is-4">Data Product</h4>
                <div class="columns reverse-columns">
                    <div class="column content">
                        <p>
                            A data product is a logical unit that contains all components to process and store domain data for analytical or data-intensive use cases and makes them available to other teams via output ports. You can think of a microservice, but for analytical data.
                        </p>
                        <p>
                            Data products connect to sources, such as operational systems or other data products and perform data transformation. Data products serve data sets in one or many output ports. Some examples:
                        </p>
                        <ul>
                            <li>A BigQuery dataset with multiple related tables</li>
                            <li>Parquet files in an AWS S3 bucket</li>
                            <li>Delta files in Azure Data Lake Storage Gen2</li>
                            <li>Dashboards in Looker</li>
                            <li>A machine learning model as an ONNX file</li>
                        </ul>
                        <p>
                            When data products provide data for other teams, a data contract defines the endpoint, syntax, semantics, and quality of provided data, similar to OpenAPI or AsyncAPI specifications. A data contract is implemented by an output port.
                        </p>
                        <p>
                            A data product is owned by a domain team.
                            The team is responsible for the operations of the data product during its entire lifecycle.
                            The team needs to continuously monitor and ensure data quality, availability, and costs.
                            For example, keep the data without duplicates or react to missing entries.
                        </p>
                        <p>
                            To design data products, we recommend to use the <strong><a href="/data-product-canvas.html">Data Product Canvas</a></strong>.
                        </p>
                    </div>
                    <div class="column">

                        <p class="image">
                            <img src="images/data-product-canvas-template.png.webp" alt="Data Product consists of Data, Metadata, and Operations">
                        </p>
                    </div>
                </div>
            </div>
        </div>

        <div class="card mt-5">
            <div class="card-content">
                <h4 class="title is-4">Data Contract</h4>
                <div class="columns reverse-columns">
                    <div class="column content">
                        <p>
                            A data contract is a document that defines the structure, format, semantics, quality, and terms of use for exchanging data between a data provider and their consumers.
                            It covers:
                        </p>
                        <ul>
                            <li>Data Product Provider, including owner and the output port to access</li>
                            <li>Terms and conditions of data usage</li>
                            <li>Schema and semantics of provided data attributes</li>
                            <li>Quality attributes, such as freshness and number of rows</li>
                            <li>Service-level objectives, such as availability and support times</li>
                            <li>Billing details for using data</li>
                        </ul>
                        <p>
                            While a data contract represents the interface specification, the actual implementation that provides the data is the output port of a data product.
                        </p>
                        <p>
                            Data contracts come into play when data is exchanged between different teams or organizational units. First, and foremost, data contracts are a communication tool to express a common understanding of how data should be structured and interpreted. They make semantic and quality expectations explicit. Later in development and production, they also serve as the basis for code generation, testing, schema validations, quality checks, monitoring, access control, and computational governance policies. Data contracts can also be used for the input port for consumer-driven contract testing to verify that the data is provided as specified.
                        </p>
                        <p>
                            The <strong><a href="https://datacontract.com/">Data Contract Specification</a></strong> defines a YAML format to describe the terms of use and attributes of provided data sets.
                        </p>
                    </div>
                    <div class="column">
<!--                        <figure class="media-right">-->
<!--                            <p class="image">-->
<!--                                <a href="images/datacontract.png.webp" class="glightbox">-->
<!--                                    <img src="images/datacontract.png.webp" alt="Data contract">-->
<!--                                </a>-->
<!--                            </p>-->
<!--                        </figure>-->
                        <figure class="media-right">
                            <p class="image">
                                <a href="images/datacontract-example.png" class="glightbox">
                                    <img src="images/datacontract-example.png" alt="Data contract example">
                                </a>
                            </p>
                        </figure>
                    </div>
                </div>
            </div>
        </div>

        <div class="card mt-5">
            <div class="card-content">
                <h4 class="title is-4">Federated Governance</h4>
                <div class="columns reverse-columns">
                    <div class="column content">
                        <p>
                            The federated governance group is typically organized as a guild consisting of representatives of all teams taking part in the data mesh.
                            They agree on global policies, which are the rules of play in the data mesh.
                            These rules define how the domain teams have to build their data products.
                        </p>
                        <p>
                            Policies on <strong>interoperability</strong> are the starting point.
                            They allow other domain teams to use data products in a consistent way.
                            For example, global policies could define that the standard way to provide data is as a CSV file on AWS S3 in a bucket owned by the corresponding domain team.
                        </p>
                        <p>
                            Next, there has to be some form of <strong>documentation</strong> to discover and understand available data products.
                            A simple policy for this could be a wiki page with a predefined set of metadata, such as owner of the data product, location URL, and descriptions of the CSV fields.
                        </p>
                        <p>
                            A uniform way to access the actual data product in a <strong>secure</strong> way could be using role-based access in AWS IAM, managed by the domain team.
                        </p>
                        <p>
                            Global policies such as <strong>privacy</strong> and <strong>compliance</strong> are also common. Think about protection of personally identifiable information (PII) or industry-specific legal requirements.
                        </p>
                        <p>
                            Lots of example policies are available on our other website <strong><a href="https://datamesh-governance.com">datamesh-governance.com</a></strong>
                            that you easily use in the <strong><a href="https://www.datamesh-manager.com">Data Mesh Manager</a></strong>, our tool for data mesh governance.
                        </p>
                    </div>

                    <div class="column">
                        <figure class="media-right">
                            <p class="image">
                                <a href="images/datamesh.png.webp" class="glightbox">
                                    <img src="images/screenshot_policy_details.png" alt="Four examples of global polices">
                                </a>
                            </p>
                        </figure>
                    </div>
                </div>

            </div>
        </div>

        <div class="card mt-5">
            <div class="card-content">
                <h4 class="title is-4">Transformations</h4>
                <div class="columns reverse-columns">
                    <div class="column content">
                        <p>
                            Diving into the organization of data within a data product, we can see the different kind of data that flows through different stages.
                            Operational data is often ingested as some kind of <strong>raw</strong> and unstructured data.
                        </p>
                        <p>
                            In a preprocessing step, raw data is cleaned and structured into events and entities.
                            <strong>Events</strong> are small, immutable, and highly domain oriented, such as <em>OrderPurchased</em> or <em>ShipmentDelivered</em>.
                            <strong>Entities</strong> represent business objects such as <em>shipments</em> or <em>articles</em> with their state changing over time. That’s why the entities often are represented as a list of snapshots, the history, with the latest snapshot being the current state.
                        </p>
                        <p>
                            In practice, we often see <strong>manually</strong> entered or imported data.
                            For example, forecast data sent via email as CSV files or text descriptions for business codes.
                        </p>
                        <p>
                            Data from other teams are integrated as <strong>external</strong> data.
                            When using data products from other teams that are well-governed, this integration might be implemented in a very lightweight way.
                            In case of importing data from legacy systems, the external area acts as an <a href="https://www.domainlanguage.com/ddd/reference/">anti-corruption layer&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a>.
                        </p>
                        <p>
                            <strong>Aggregations</strong> combine data to answer analytical questions.
                            Domain data can be published to other teams by defining a data contract. The data contract is usually implemented by a view, that is stable, even when the underlying data models change.
                        </p>
                    </div>

                    <div class="column">
                        <figure class="image">
                            <a href="images/analyticaldata.png.webp" class="glightbox">
                                <img src="images/analyticaldata.png.webp" alt="More detailed view on the analytical data of the data mesh architecture">
                            </a>
                        </figure>
                    </div>
                </div>


            </div>
        </div>

        <div class="card mt-5">
            <div class="card-content">
                <h4 class="title is-4">Ingesting</h4>
                <div class="columns reverse-columns">
                    <div class="column content">
                        <p>
                            How can domain teams ingest their operational data into the data platform?
                            A software system designed according to domain-driven design principles contains data as mutable entities/aggregates and immutable domain events.
                        </p>
                        <p>
                            <strong>Domain events</strong> are a great fit to be ingested into the data platform as they represent relevant business facts.
                            If there’s a messaging system in place domain events can be forwarded to the data platform by attaching an additional message consumer.
                            Data can be collected, processed, and forwarded to the data platform in real time.
                            With this <strong>streaming ingestion</strong>, data is sent in small batches when they arrive, so they are immediately available for analytics.
                            As domain events are already well defined, there is little to do in terms of cleaning and preprocessing, except deduplication
                            and anonymization of PII data.
                            Sometimes, it is also advisable to define and ingest internal analytical events that contain information that is relevant only for analytical use cases so that domain events don’t have to be modified.
                            <br>
                            <em>Examples for streaming ingestion: Kafka Connect, Kafka Streams, AWS Lambda</em>
                        </p>
                        <p>
                            Many business objects are persisted as <strong>entities and aggregates</strong> in SQL or NoSQL databases.
                            Their state changes over time, and the latest state is persisted in the database only.
                            Strong candidates for entities with state are <em>articles</em>, <em>prices</em>, <em>customer data</em>, or <em>shipment status</em>.
                            For analytical use cases, it is often required to have both the latest state and the history of states over time.
                            There are several approaches to ingest entities.
                            One way is to generate and publish an <strong>onCreate/onUpdate/onDelete event</strong> with the current state  every time an entity is changed, e.g. by adding an
                            <a href="https://en.wikipedia.org/wiki/Aspect-oriented_programming">aspect&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> or <a href="https://docs.spring.io/spring-data/jpa/docs/current/api/org/springframework/data/jpa/domain/support/AuditingEntityListener.html">EntityListeners&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a>.
                            Then streaming ingestion can be used to ingest the data as described above.
                            When it is not feasible to change the operational software, <strong>change
                            data capture (CDC)</strong> may be used to listen to database changes directly and stream them into the data platform.
                            <br>
                            <em>Examples for CDC streaming: <a href="https://debezium.io">Debezium&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a></em>
                        </p>
                        <p>
                            Lastly, traditional scheduled <strong>ELT or ETL jobs</strong> that export data to file and load them into the platform can be set up, with the downside of not having real-time data, not having all stage changes between exports, and some work to consolidate exported data again. However, they are a viable option for legacy systems, such as mainframes.
                        </p>
                    </div>

                    <div class="column">
                        <figure class="image">
                            <img src="images/ingestingandcleaning.png.webp" alt="Closer look at ingesting and cleaning of data">
                        </figure>
                    </div>
                </div>
            </div>
        </div>

        <div class="card mt-5">
            <div class="card-content">
                <h4 class="title is-4">Clean Data</h4>
                <div class="columns reverse-columns">
                    <div class="column content">
                        <p>
                            Clean data is the foundation for effective data analytics.
                            With data mesh, domain teams are responsible for performing data cleaning.
                            They know their domain and can identify why and how their domain data needs to be processed.
                        </p>
                        <p>
                            Data that is ingested into the data platform is usually imported in its original raw and unstructured format.
                            When using a columnar database, this might be a row per event that contains a
                            <a href="https://en.wikipedia.org/wiki/Character_large_object">CLOB&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> field for the event payload, which may be in JSON format.

                            Now it can be preprocessed to get data clean:
                        </p>
                        <ul>
                            <li><strong>Structuring:</strong> Transform unstructured and semi-structured data to the analytical data model, e.g., by extracting JSON fields into columns.</li>
                            <li><strong>Mitigation of structural changes:</strong> When data structures have changed, mitigate them, e.g., by filling null values with sensible defaults.</li>
                            <li><strong>Deduplication:</strong> As most analytical storage systems are append-only, entities and events cannot be updated. Remove all duplicate entries.</li>
                            <li><strong>Completeness:</strong> Ensure that data contain agreed periods, even when there were technical issues during ingestion.</li>
                            <li><strong>Fix outliers:</strong> Invalid data that may occur through bugs get identified and corrected.</li>
                        </ul>
                        <p>
                            From an implementation perspective, these preprocessing steps can be implemented as simple SQL views that project the raw data.
                            The queries may be organized through
                            <a href="https://cloud.google.com/bigquery/docs/reference/standard-sql/query-syntax#with_clause">common table expressions&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> (CTEs)
                            and may be enhanced with <a href="https://cloud.google.com/bigquery/docs/reference/standard-sql/user-defined-functions">user-defined functions&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> (UDFs), e.g., for JSON processing.
                            As an alternative, the cleaning steps can be implemented as lambda functions that operate on topics.
                            More complex pipelines can be built with frameworks like <a href="https://www.getdbt.com">dbt&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> or
                            <a href="https://beam.apache.org">Apache Beam&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> that offer an advanced programming model,
                            but also require more skills to master.
                        </p>
                    </div>
                    <div class="column is-half">
                        <script src="https://gist.github.com/jochenchrist/e7933a5dc2cb86a67ea5df302ca417c6.js"></script>
                    </div>
                </div>
            </div>
        </div>

        <div class="card mt-5">
            <div class="card-content">
                <h4 class="title is-4">Analytics</h4>
                <div class="columns reverse-columns">
                    <div class="column content">
                        <p>
                            To gain insights, domain teams query, process, and aggregate their analytical data together with relevant data products from other domains.
                        </p>
                        <p>
                            <strong>SQL</strong> is the foundation for most analytical queries.
                            It provides powerful functions to connect and investigate data.
                            The data platform should perform join operations efficiently, even for large data sets.
                            Aggregations are used to group data and window functions help to perform a calculation across multiple rows.
                            Notebooks help to build and document exploratory findings.
                            <br>
                            <em>Examples: Jupyter Notebooks, Presto</em>
                        </p>
                        <p>
                            Humans understand data, trends, and anomalies much easier when they perceive them visually.
                            There are a number of great data <strong>visualization</strong> tools that build beautiful charts, key performance indicator overviews, dashboards and reports.
                            They provide an easy-to-use UI to drill down, filter, and aggregate data.
                            <br>
                            <em>Examples: Looker, Tableau, Metabase, Redash</em>
                        </p>
                        <p>
                            For more advanced insights, <strong>data science and machine learning</strong> methods can be applied.
                            These enable correlation analyses, prediction models, and other advanced use cases.
                            Special methodological, statistical, and technological skills are required.
                            <br>
                            <em>Examples: scikit-learn, PyTorch, TensorFlow</em>
                        </p>
                    </div>

                    <div class="column">
                        <figure class="image">
                            <img src="images/notebook.png.webp" alt="Jupyter Notebook executing queries on Google BigQuery">
                        </figure>
                    </div>
                </div>
            </div>
        </div>


        <div class="card mt-5">
            <div class="card-content">
                <div class="content">
                    <h4 class="title is-4">Data Platform</h4>

                    <p>
                    The self-serve data platform may vary for each organization.
                    Data mesh is a new field and vendors are starting to add data mesh capabilities to their existing offerings.
                    </p>

                    <p>
                        Looking from the desired capabilities, you can distinguish between analytical capabilities and data product capabilities:
                            <strong>Analytical capabilities</strong> enable the domain team to build an analytical data model and perform analytics for data-driven decisions.
                        The data platform needs functions to ingest, store, query, and visualize data as a self-service.
                        Typical data warehouse and data lake solutions, whether on-premise or a cloud provider, already exist.
                        The major difference is that each domain team gets its own isolated area.
                    </p>

                    <p>
                        A more advanced data platform for data mesh also provides additional domain-agnostic <strong>data product capabilities</strong>
                        for creating, monitoring, discovering, and accessing data products.
                        The self-serve data platform should support the domain teams so that they can quickly build a data product as well as run it in production in their isolated area.
                        The platform should support the domain team in publishing their data products so that other teams can discover them.
                        The discovery requires a central entry point for all the decentralized data products.
                        A data catalog can be implemented in different ways: as a wiki, git repository, or there are even already vendor solutions for a cloud-based data catalog such as Select Star, Google Data Catalog, or AWS Glue Data Catalog.
                        The actual usage of data products, however, requires a domain team to access, integrate, and query other domains' data products.
                        The platform should support, monitor, and document the cross-domain access and usage of data products.
                    </p>
                    <p>
                        An even more advanced data platform supports <strong>policy automation</strong>.
                        This means that, instead of forcing the domain team to manually ensure that the global policies are not violated, the policies are automatically enforced through the platform.
                        For example, that all data products have the same metadata structure in the data catalog, or that the PII data are automatically removed during data ingestion.
                    </p>
                    <p>
                        Efficiently combining data products from multiple domains, i.e., having large cross-domain join operations within a few seconds, ensures developer acceptance and happiness.
                        That's why the <strong>query engine has a large influence on the architecture of the data platform</strong>.
                        A shared platform with a single query language and support for separated areas is a good way to start as everything is highly integrated.
                        This could be Google BigQuery with tables in multiple projects that are discoverable through Google Data Catalog.
                        In a more decentralized and distributed data mesh, a distributed query engine such as Presto can still perform cross-domain joins without importing data, but they come with their own limitations, e.g., limited pushdowns require that all underlying column data need to be transferred.
                    </p>
                </div>
            </div>
        </div>
<!--
        <div class="card mt-5">
            <div class="card-content">
                <div class="content">
                    <h4 class="title is-4">Data Platform: Storage</h4>
                    <p>
                        From a technical perspective, data mesh relies on data warehouses and data lakes.
                        The data platform gives domain teams the choice, which technology suits best for their use cases.
                    </p>
                    <p>
                        Storing data is a primary purpose of any data platform.
                        As hundreds of terabytes and more may be stored on the platform, storage should be cost-efficient and provide good performance for complex analytical queries.
                    </p>
                    <p>
                        Most common are <strong>SQL-like databases</strong> that store data in tables with a structured schema, with the option of semi-structured fields.
                        These databases usually use columnar data stores to retrieve and search large datasets efficiently, hence there is no need for indexes.
                        For smaller datasets, it is also possible to use traditional relational databases, but these will require more storage management efforts and index optimizations to build efficient queries.
                        <br>
                        <em>Examples for SQL-like databases: Google BigQuery, AWS Redshift, Azure Synapse Analytics, Snowflake, PostgreSQL</em>
                    </p>
                    <p>
                        In some use cases, data needs to be exported from operative or third-party systems as files, such as CSV, JSON or XML.
                        The data platform may support storing files in an <strong>object storage</strong>, that can be further processed or analyzed.
                        <br>
                        <em>Examples for object storage: Google Object Store, AWS S3, Git</em>
                    </p>
                    <p>
                        For high-volume real time data, it might be inappropriate to store all events permanently.
                        A <strong>streaming platform</strong> with short-time retention can be the suitable technology to perform real time analytics.
                        <br>
                        <em>Examples for streaming platforms: Apache Kafka, Confluent, AWS Kinesis</em>
                    </p>
                </div>
            </div>
        </div>

        <div class="card mt-5">
            <div class="card-content">
                <div class="content">
                    <h4 class="title is-4">Data Platform: Data Catalog</h4>
                    <p>
                        TBD
                    </p>
                </div>
            </div>
        </div>

        <div class="card mt-5">
            <div class="card-content">
                <div class="content">
                    <h4 class="title is-4">Data Platform: Policy Automation</h4>
                    <p>
                        TBD
                    </p>
                </div>
            </div>
        </div>


        <div class="card mt-5">
            <div class="card-content">
                <div class="content">
                    <h4 class="title is-4">Data Platform: Operations</h4>
                    <p>
                        TBD
                    </p>
                </div>
            </div>
        </div>
        -->

        <div class="card mt-5">
            <div class="card-content">
                <div class="content">
                    <h4 class="title is-4">Enabling Team</h4>
                    <p>
                        The enabling team spreads the idea of data mesh within the organization.
                        In the beginning of data mesh adoption, a lot of explanatory efforts will be required and the enabling team can act as data mesh advocates.
                        They help domain teams <a href="#domain-teams-journey">on their journey to become a full member of the data mesh</a>.
                        The enabling team consists of specialists with extensive knowledge on
                        data analytics, data engineering, and the self-serve data platform.
                    </p>
                    <p>
                        A member of the enabling team temporarily joins a domain team for a limited time span like a month as an
                        <strong>internal consultant</strong> to understand the team’s needs, establish a learning environment,
                        upskill the team members in data analytics, and guide them on how to use the self-serve data platform.
                        They don’t create data products by themselves.
                    </p>
                    <p>
                        In between their consulting engagements, they <strong>share learning
                        materials</strong> such as walking skeletons, examples, best practices, tutorials, or even podcasts.
                    </p>
                </div>
            </div>
        </div>

    </section>


    <section class="section is-medium">
        <h2 class="title is-1">Mesh</h2>
        <div class="content">
            <p class="mt-5">
                The <em>mesh</em> emerges when teams use other domain's data products.
                Using data from upstream domains simplifies data references and lookups (such as getting an article's price),
                while data from downstream domains enables analyzing effects, e.g. for A/B tests (such as changes in the conversion rate).
                Data from multiple other domains can be aggregated to build comprehensive reports and new data products.
            </p>
            <p>
                Let's look at a simplified e-commerce example:
            </p>

            <div class="block">
                <a href="images/mesh.png.webp" class="glightbox">
                    <img src="images/mesh.png.webp" alt="Domains access data products from other domains">
                </a>

            </div>

            <p>
                Domains can be classified by data characteristics and data product usage. We adopt Zhamak Dehghani’s classification:
            </p>

            <h2>Source-aligned</h2>
            <p>
                In this example, an online shop is subdivided into domains along the customer journey, from <em>product search</em> over <em>checkout</em> to <em>payment</em>.
                In a data mesh, these domains publish their data as data products, so others can access them.
                The engineers do analytics on their own data to improve their operational systems and validate the business value of new features.
                They use domain neighbor’s data to simplify their queries and get insights on effects in downstream domains.
                These domain data can be referred to as <strong>source-aligned</strong>, as most of their published data products correspond closely to the <em>domain events</em> and <em>entities</em> generated in their operational systems.
            </p>

            <h2>Aggregate</h2>
            <p>
                For <a href="https://teamtopologies.com/key-concepts">complicated subsystems&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a>, it can be efficient that a team focuses solely on delivering a data product that is <strong>aggregated</strong> of various data products from other domains.
                A typical example is a 360° customer view that includes relevant data from multiple domains, such as account data, orders, shipments, invoices, returns, account balance, and internal ratings.
                With respect to different bounded contexts, a comprehensive 360° customer view is hard to build, but it might be useful for many other domains.
                Another example for a complicated subsystem is building sophisticated ML models that require enhanced data science skills.
                It may be sensible that a data scientists team develops and trains a recommendation model by using data from checkout and the 360° customer view, and another team uses this model and focuses to present the calculated recommendations in the online shop or in promotional emails.
            </p>

            <h2>Consumer-aligned</h2>
            <p>
                In a company, there are also business departments that need data from the whole value stream to make sensible decisions, with people working in these departments are business experts but not engineers or technology-savvy.
                Management and controlling requires detailed reports and KPIs from all domains to identify strengths and deviations.
                Marketing does funnel and web analysis over all steps in the customer journey in their own optimized tools, such as Google Analytics or Adobe Analytics.
                In these domains, the data model is optimized for a specific department's needs and can therefore be described as <strong>consumer-aligned</strong>.
                Consumer-aligned reports were often one of the main tasks of central data teams.
                With data mesh, (new) consumer-aligned domain teams focus on fulfilling data needs of one specific business domain, allowing them to gain deep domain knowledge and constantly develop better analytical results.
                Business and IT grow closer together, either by building integrated domain teams or by having engineering teams that provide domain data as a service for the business, e.g., to support C-level or controlling.
                Their data are typically used for their analytics and reports, but does not need to be published and managed as data products for other domains.
            </p>
        </div>
    </section>


    <section class="section is-medium">
        <h2 class="title is-1" id="tech-stacks">Tech Stacks</h2>
        <div class="notification is-info is-light">
            Data mesh is primarily an organizational approach, and that's why you can't buy a data mesh from a vendor. Technology, however, is important still as it acts as an enabler for data mesh, and only useful and easy to use solutions will lead to domain teams' acceptance. The available offerings of cloud providers already provide a sufficient set of good self-serve data services to let you form a data platform for your data mesh. We want to show which services can be used to get started.
        </div>
        <div class="content">
            <p>
                There are a lot of different ways to implement a data mesh architecture.
                Here is a selection of typical tech stacks that we saw:
            </p>

            <ul>
                <li><a href="tech-stacks/google-cloud-bigquery.html">Google Cloud BigQuery</a></li>
                <li><a href="tech-stacks/aws-s3-athena.html">AWS S3 and Athena</a></li>
                <li><a href="tech-stacks/azure-synapse-analytics.html">Azure Synapse Analytics</a></li>
                <li><a href="tech-stacks/dbt-snowflake.html">dbt and Snowflake</a></li>
                <li><a href="tech-stacks/databricks.html">Databricks</a></li>
                <li><a href="tech-stacks/minio-trino.html">MinIO and Trino</a></li>
                <li><a href="tech-stacks/sap.html">SAP</a></li>
                <li>Starburst Enterprise (TBD)</li>
            </ul>
            <p>
                If you want to share your tech stack here, feel free to <a href="#authors">reach out to us</a>.
            </p>
        </div>
    </section>


    <section class="section is-medium">
        <div class="columns is-vcentered">
            <div class="column content">
            </div>

            <div class="column content">
                <h2 class="title is-1" id="domain-teams-journey">Domain Team’s Journey</h2>
            </div>
        </div>

        <div class="columns">
            <div class="column">
                <figure class="image">
                    <a href="images/domainteamjourney.png.webp" class="glightbox">
                        <img src="images/domainteamjourney.png.webp"
                             alt="The five levels of the domain team's journey: (Level 0) No Data Analytics; (Level 1) Operational Database Queries; (Level 2) Analyze Own Data; (Level 3) Analyze Cross-domain Data; (Level 4) Publish Data as a Product">
                    </a>
                </figure>
            </div>

            <div class="column content">
                <p>
                    Just as the <a href="#data-teams-journey">data team has a journey to go on</a>,
                    each of your domain teams has to go on a journey to become a contributing part
                    of your data mesh as well.
                    Each team can start their journey whenever they are ready and at their own pace.
                    The benefits arise already along the journey.
                    Teams will quickly gain from first data-driven decisions, starting an avalanche
                    to use more and better data for even deeper insights.
                    The data mesh evolves with each team that shares their data as products,
                    enabling data-driven innovation.
                </p>
                <p>
                    To make this journey successful, the team needs three things: a clear data mesh
                    vision from top management to get everybody moving in the same direction, a
                    supportive environment including an easy-to-use self-serve data platform to get
                    the engineering team on a learning path toward data analytics, and a high trust
                    environment to walk the journey in their own way and pace.
                </p>
                <p>So let’s start your journey!</p>
            </div>
        </div>

        <div class="columns is-vcentered mt-6">
            <div class="column content">
                <h3 class="title is-3"><span class="tag">Level 0</span> No Data Analytics</h3>
            </div>
        </div>

        <div class="columns reverse-columns is-vcentered">
            <div class="column content">
                <p>
                    Your team is responsible for a domain and builds and operates <a
                    href="https://scs-architecture.org/">self-contained systems&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> including the necessary
                    infrastructure.
                    It was quite an effort to build these systems, and you were highly focused on
                    delivery excellence.
                    These operational systems now generate domain data.
                </p>
                <p>
                    Data analytics was just not relevant.
                </p>
            </div>
            <div class="column">
                <figure class="image">
                    <a href="images/domainteamjourney_level0.png.webp" class="glightbox">
                        <img src="images/domainteamjourney_level0.png.webp"
                             alt="Level 0: No Data Analytics">
                    </a>
                </figure>
            </div>
        </div>

        <div class="columns is-vcentered mt-6">
            <div class="column content">
                <h3 class="title is-3"><span class="tag">Level 1</span> Operational Database Queries
                </h3>
            </div>
        </div>

        <div class="columns reverse-columns is-vcentered">
            <div class="column content">
                <p>
                    Being in production, you probably have to investigate an incident and need to
                    analyze how many customers are affected.
                    Also, some stakeholders might have questions regarding your data, such as "Which
                    in-stock articles haven’t been sold in the last six months?"
                    or "What were the shipping times during the last Black Week?"
                    To answer all these questions, you send analytical queries to your operational
                    database.
                    Over time, you also do some first explorative analytics to get a deeper
                    understanding of your system’s behavior.
                </p>
                <p>
                    This increases load on your production database, and you might be tempted to
                    change the production database to better support your analytical queries,
                    like creating additional indices.
                    You might offload the additional load to read replicas.
                    But analytical queries are still slow and cumbersome to write.
                </p>
            </div>
            <div class="column">
                <figure class="image">
                    <a href="images/domainteamjourney_level1.png.webp" class="glightbox">
                        <img src="images/domainteamjourney_level1.png.webp"
                             alt="Level 1: Operational Database Queries">
                    </a>
                </figure>
            </div>
        </div>


        <div class="columns is-vcentered mt-6">
            <div class="column content">
                <h3 class="title is-3"><span class="tag">Level 2</span> Analyze Own Data</h3>
            </div>
        </div>
        <div class="columns reverse-columns is-vcentered">
            <div class="column content">
                <p>
                    With the pains of slow and hard-to-write analytical queries in the back of your
                    mind, you try out the self-serve data platform that’s being promoted by the data
                    platform team.
                    For example, you now have access to Google BigQuery.
                    On this platform, your team starts to build an analytical data model by ingesting
                    messages from Kafka. This is your first data product.
                    The data platform allows you to analyze data covering your own systems with maintainable and
                    fast queries, while keeping the schemas of your operational databases untouched.
                    You learn how to structure, preprocess, clean, analyze, and visualize analytical
                    data&mdash;that’s a lot to learn even though most is SQL, which you are already
                    familiar with.
                </p>
                <p>
                    As questions regarding your own data can now be answered quickly, you and your
                    product owner now enter the cycle of making data-driven decisions: define
                    hypotheses and verify with data.
                </p>

            </div>
            <div class="column">
                <figure class="image">
                    <a href="images/domainteamjourney_level2.png.webp" class="glightbox">
                        <img src="images/domainteamjourney_level2.png.webp"
                             alt="Level 2: Analyze Own Data">
                    </a>
                </figure>
            </div>
        </div>

        <div class="columns is-vcentered mt-6">
            <div class="column content">
                <h3 class="title is-3"><span class="tag">Level 3</span> Analyze Cross-domain Data
                </h3>
            </div>
        </div>

        <div class="columns reverse-columns is-vcentered">
            <div class="column content">
                <p>
                    Analyzing your own domain data is a great start, but combining it with data from
                    other domains is where the magic begins.
                    It allows you to get a comprehensive view despite the decentralization of data.
                    Examples are A/B tests of the effect of a UI change to the conversion rate or
                    building up machine learning models for fraud detection that include previous
                    purchasing history and current click stream behavior.
                    This requires that other teams share their data products in a way that your team can
                    discover, access, and use it.
                    This is when the mesh begins to form itself.
                </p>
                <p>
                    When a team becomes a consuming member of the data mesh, it starts to gain
                    interest in the interoperability and governance of the data mesh.
                    Ideally, the team will send a representative to the data mesh governance group.
                </p>
                <p>
                    In case you are the first team, you may have to skip this step for now and move
                    on to level 4 and be the first to provide data for others.
                </p>
            </div>
            <div class="column">
                <figure class="image">
                    <a href="images/domainteamjourney_level3.png.webp" class="glightbox">
                        <img src="images/domainteamjourney_level3.png.webp"
                             alt="Level 3: Analyze Cross-domain Data">
                    </a>
                </figure>
            </div>
        </div>


        <div class="columns is-vcentered mt-6">
            <div class="column content">
                <h3 class="title is-3"><span class="tag">Level 4</span> Publish Data Contracts
                </h3>
            </div>
        </div>
        <div class="columns reverse-columns is-vcentered">
            <div class="column content">
                <p>
                    Based on other teams' needs, you share your data with others by publishing data contracts.
                    For example, you provide the confirmed, rejected, and aborted orders so others
                    can correlate their events to the conversion rate.
                    Instead of just being a consumer of data products, you become a  publisher of data
                    products.
                    You generate value for other teams.
                    But at the same time, it increases your responsibility and operational duties in
                    the long term.
                </p>
                <p>
                    Published data products must comply with the global policies defined by the federated
                    governance group.
                    You have to know and understand the current global policies.
                    Now, at the latest, you need to participate in and contribute to the federated
                    governance group.
                </p>
            </div>
            <div class="column">
                <figure class="image">
                    <a href="images/domainteamjourney_level4.png.webp" class="glightbox">
                        <img src="images/domainteamjourney_level4.png.webp"
                             alt="Level 4: Publish Data Contracts">
                    </a>
                </figure>
            </div>
        </div>
    </section>


    <section class="section is-medium">

        <div class="columns">
            <div class="column content">
            </div>
            <div class="column">
                <h2 class="title is-1" id="data-teams-journey">Data Team’s Journey</h2>
            </div>
        </div>

        <div class="columns is-vcentered">
        <div class="column">
            <a href="images/datateamjourney.png.webp" class="glightbox">
                <img src="images/datateamjourney.png.webp" alt="Data Team's Journey: From a central data team toward an enablement team, data platform team, and (new) domain teams with data expertise">
            </a>
        </div>
        <div class="column">

            <div class="content">

            <p>
                Data mesh is primarily an organizational construct and fits right into the <a href="https://teamtopologies.com/key-concepts">principles of team topologies&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a>.
                It shifts the responsibilities for data toward domain teams which are supported by a data platform team and a data enabling team.
                Representatives of all teams come together in a federated governance group to define the common standards.
            </p>
            <p>
                Today, in many organizations a central data team is responsible for a wide range of analytical tasks, from data engineering and managing data infrastructure to creating C-level reports.
                Such a central data team suffers from cognitive overload, including domain, technical, and methodical knowledge.
                data mesh mitigates this.
            </p>
            <p>
                Data mesh offers new perspectives for members of the central data team as their analytical and data engineering skills remain highly necessary.
                For example, they are a perfect fit to establish the data platform for people that prefer to work on the infrastructure.
                Some of them can form a data enabling team to act as internal consultants, <a href="#domain-teams-journey">helping domain teams on their journey</a>.
                Regardless of their new roles, many of them will meet again in the data mesh federated governance group to shape the future of the data mesh.</p>
            </div>
        </div>
        </div>

        <div class="columns reverse-columns is-vcentered">

            <div class="column">
                <div class="content">
        <p>
            The real mind shift, however, happens when founding new data-centric domains as shown in the figure above.
            Let’s look at typical management reports that large central data teams usually produce based on monolithic data warehouses or data lakes.
            With data mesh, the data engineers who created those management reports build a new domain team together with a dedicated product owner.
            As engineers of the new domain team, they now can focus on their new domain and their consumers.
            This allows them to gain deep domain knowledge over time, resulting in better reports and continuous optimizations.
            In addition, they switch from using that monolith data warehouse to data products from other domains.
            This switch is a gradual process driven by the demand for data products, accelerating the forming of a data mesh.
            The product owner negotiates with other domain teams about the required data products and makes sure that the reports and other products the new domain team will build in the future fulfill the needs of the business.
        </p>
        <p>
            As existing domain teams on their journey do more and more data analytics, another perspective for members of the central data team is to join one of those teams.
            With their existing knowledge, they can accelerate the domain teams’ journeys toward a data mesh by spreading and teaching their knowledge and skills to the others in the team.
            It is important that they become full members of the team and not founding a data sub-team within the domain team.
            In addition to their knowledge and skills, the data engineers may also bring responsibilities and artifacts from the central data team to their domain teams.
            For example, customer profiling, which was previously done by the central data team, will move into the responsibility of the recommendation domain team.
        </p>
        <p>
            The data scientists, typically, are centrally organized as well.
            That’s why their future, organizational-wise, is quite similar to that of the central data team.
            The data products in the data mesh they focus on are machine learning features and models.
            When joining an existing domain team, such a machine learning model might be fully integrated in a microservice.
            So, data mesh enables such machine-learning-based services because the required <a href="https://ml-ops.org">MLOps&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> capabilities can be easily built on top of the data mesh.
        </p></div>
        </div>
            <div class="column">
                <a href="images/newdomainteam.png.webp" class="glightbox">
                    <img src="images/newdomainteam.png.webp" alt="Migrating an existing data product and some data engineers from the central data team to a (new) domain team">
                </a>
            </div>
        </div>

    </section>

    <section class="section is-medium">
        <div class="content">
            <h2 class="title is-1" id="faq">FAQ</h2>

            <h4 class="title">So, what's really behind the hype?</h4>
            <p>
                Data mesh is primarily an organizational change.
                The responsibilities of data are shifted closer to the business value stream.
                This enables faster data-driven decisions and reduces barriers for data-centric innovations.
            </p>


            <h4 class="title">Who has actually implemented a data mesh?</h4>
            <p>
                There is a comprehensive collection of <a href="https://datameshlearning.com/user-stories/">user journey stories</a> from the Data Mesh Learning community
                that covers data mesh examples from many different industries.
            </p>

            <h4 class="title">Is Data Mesh for my company?</h4>
            <p>
                It depends, of course.
                There are a few prerequisites that should be in place:
                You should have modularized your software system following domain-driven design principles or something similar.
                You should have a good number (5+) of independent domain teams that have their systems already running in production.
                And finally, you should trust your teams to make data-driven decisions on their own.
            </p>

            <h4 class="title">How to get started?</h4>
            <p>
                Start small and agree on the big picture.
                Find two domain teams (that are around level 2) that have a high value use case where one team needs data from the other team.
                Let one team build a data product (level 4) and another team use that data product (level 3).
                You don’t need a sophisticated data platform yet.
                You can start sharing the files via AWS S3, a Git repository, or use a cloud-based database, such as Google BigQuery.
            </p>

            <h4 class="title">When should I avoid a Data Mesh?</h4>
            <p>
                There are some indicators when a data mesh approach might not be suitable for you, including:
            </p>
            <ul>
                <li>You are too small and don’t have multiple independent engineering teams.</li>
                <li>You have low-latency data requirements. Data Mesh is a network of data. If you need to optimize for low-latency, invest in a more integrated data platform.</li>
                <li>You are happy with your monolithic highly integrated system (such as SAP). It might be more efficient to use their analytical platform.</li>
            </ul>

            <h4 class="title">What is Data Mesh not?</h4>
            <ul>
                <li>Data Mesh is not a Silver Bullet.</li>
                <li>Data Mesh is not a religion.</li>
                <li>Data Mesh is not plug-and-play.</li>
                <li>Data Mesh is not a product you can just buy.</li>
                <li>Data Mesh is not a data-only platform.</li>
                <li>Data Mesh cannot be implemented by the data team alone.</li>
                <li>Data Mesh is not a concept for operational data.</li>
                <li>Data Mesh is not data virtualization.</li>
                <li>Data Mesh is not the successor to Data Warehouse or Data Lake.</li>
                <li>Data Mesh cannot be rapidly implemented as Big Bang.</li>
                <li>Data mesh is not a service mesh for data.</li>
                <li>And data mesh has absolutely nothing to do with blockchain.</li>
            </ul>

            <h4 class="title">Is the Data Mesh a generic solution to a distributed data architecture?</h4>
            <p>
                No.
            </p>
            <p>
                By definition, data mesh does not include data products used for serving real-time needs.
                Data mesh focuses on analytical use cases.
            </p>


            <!--            <h4 class="title">How to integrate data from legacy systems?</h4>-->
<!--            <p>-->
<!--                Option 1: A team needs to wrap legacy data into a data product.-->
<!--                Option 2: ACL-->
<!--            </p>-->

<!--            <h4 class="title">When should I avoid Data Mesh for my company?</h4>-->
<!--            <p>-->
<!--                TBD-->
<!--            </p>-->


<!--            <h4 class="title">Is Data Mesh DataOps?</h4>-->
<!--            <p>-->
<!--                TBD-->
<!--            </p>-->
            
            
            <h4 class="title">Data Product: What data to include in a data product? Should a data product include other domain's data?</h4>
            <p>
                Data that is created and owned by a domain are prime candidates, and the domain team should be encouraged to publish them in an appropriate, cleaned and managed form.
            </p>
            <p>
For source-aligned domains, we mostly would argue to include reference IDs.
It is OK to include other domain's data, if the data was transformed, is the basic for business decisions or the exact state of the data at a processing time was relevant. In fact, these are cases, when the processing domain takes ownership for these data based on business cases.
            </p>
            <p>
Aggregate domains and consumer-aligned domains can include all foreign data that are relevant for their consumers' use cases.
            </p>
            

            <h4 class="title">What's the difference between data mesh and data fabric?</h4>
            <p>
                At first, <a href="https://www.ibm.com/analytics/data-fabric">data fabric&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> looks similar to data mesh because it offers a similar self-serve data platform.
                Looking deeper, it turns out that data fabric is a central and domain-agnostic approach, which is in strong contrast to the domain-centric and decentralized approach of data mesh.
                <a href="https://www.datanami.com/2021/10/25/data-mesh-vs-data-fabric-understanding-the-differences/">More in this comparison article&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a>.
            </p>

            <h4 class="title">What might a journey be for teams who operate commercial off-the-shelf (COTS) systems?</h4>
            <p>
                Many COTS systems (such as Salesforce, SAP, Shopify, Odoo) provide domain optimized analytical capabilities.
                So the journey for domain teams starts directly from <a href="https://www.datamesh-architecture.com/#level-2-analyze-own-data">level 2</a>.
            </p>
            <p>
                The challenge is to integrate data products from other domains (<a href="https://www.datamesh-architecture.com/#level-3-analyze-cross-domain-data">level 3</a>, which may be skipped if not needed)
                and to publish data products for other domains (<a href="https://www.datamesh-architecture.com/#level-4-publish-data-as-a-product">level 4</a>).
                The system’s data need to be exported to the data platform and managed as data products, conforming the global policies.
                As data models evolve with system updates, an anti-corruption layer is a must, e.g., as a cleaning step.
            </p>

            <h4 class="title">How might externally acquired datasets be part of a data mesh?</h4>
            <p>
                Typical examples: Price-Databases or Medical Studies.
                A team needs to own this dataset and bring it into the datamesh. If this is not a very technical team, the data-platform should offer an easy self-service to upload files and provide Meta-Data. An Excel API or Google Sheets might also be an option here.
            </p>

<!--            <h4 class="title">How is Machine Learning related to Data Mesh?</h4>-->
<!--            <p>-->
<!--                TBD, @Larysa-->
<!--            </p>-->

            <h4 class="title">How did you draw the diagrams?</h4>
            <p>
                We got this question quite a lot, so we are happy to share our tooling:<br>
                We use <a href="https://www.diagrams.net">diagrams.net&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a> with "Sketch" style and
                <a href="https://streamlinehq.com">Streamline Icons&nbsp;<i class="fa-solid fa-arrow-up-right-from-square"></i></a>.
                We automate the conversion to PNG, SVG and WebP with a little <a href="https://github.com/datamesh-architecture/datamesh-architecture.com/blob/main/convert">script</a>.

            </p>

            <h4 class="title">What are your questions?</h4>
            <p>
                If you have any more questions, we encourage you to <a href="https://github.com/datamesh-architecture/datamesh-architecture.com/discussions">discuss with us on GitHub</a> or <a href="#authors">reach out to us directly</a>.
                But be warned:
                Your question might end up in the FAQ. :-)
            </p>

        </div>

    </section>

    <section class="section is-medium">

        <h2 class="title is-1 mb-6" id="learn-more">Learn more</h2>

        <div class="card mb-3">
            <div class="card-content">
                <div class="media">
                    <div class="media-left">
                        <span class=" icon is-large">
                        <i class="fa-solid fa-user-group fa-2x"></i>
                        </span>
                    </div>
                    <div class="media-content">
                        <p class="title is-4">
                            <a href="https://datameshlearning.com/">Data Mesh Learning Community</a>
                        </p>
                        <p class="subtitle is-6">
                            by Scott Hirleman
                        </p>
                    </div>
                </div>

                <div class="content">
                    Scott started the Data Mesh Learning community <a href="https://datameshlearning.com/">website</a>,
                    a <a href="https://launchpass.com/data-mesh-learning">Slack channel</a>,
                    and issues a <a href="https://datameshlearning.substack.com/">newsletter</a> every two weeks.
                    He collects articles and experience reports about data mesh, and even has a <a href="https://daappod.com/data-mesh-radio/">podcast</a> on the topic.
                    It is worth subscribing to stay informed.
                </div>
            </div>
        </div>

        <div class="card mb-3">
            <div class="card-content">
                <div class="media">
                    <div class="media-left">
                        <span class=" icon is-large has-text-info">
                        <i class="fa-solid fa-file-lines fa-2x"></i>
                        </span>
                    </div>
                    <div class="media-content">
                        <p class="title is-4"><a href="https://martinfowler.com/articles/data-monolith-to-mesh.html">How to Move Beyond a Monolithic Data Lake to a Distributed Data Mesh</a></p>
                        <p class="subtitle is-6">by Zhamak Dehghani</p>
                    </div>
                </div>

                <div class="content">
                    In this article, Zhamak Dehghani introduced the idea and core principles of data mesh. It is a must-read for everyone in this field. In her <a href="https://martinfowler.com/articles/data-mesh-principles.html">follow-up article</a>, Zhamak gets into more detail.
                </div>
            </div>
        </div>

        <div class="card mb-3">
            <div class="card-content">
                <div class="media">
                    <div class="media-left">
                        <p class="image is-64x64">
                            <img src="images/book_datamesh_dehghani.jpg">
                        </p>
                    </div>
                    <div class="media-content">
                        <p class="title is-4">
                            <a href="https://www.oreilly.com/library/view/data-mesh/9781492092384/">Data Mesh</a>
                        </p>
                        <p class="subtitle is-6">by Zhamak Dehghani</p>
                    </div>
                </div>

                <div class="content">
                    Zhamak's book about data mesh.
                    The book not only discusses the principles of data mesh, but also presents an execution strategy.
                </div>
            </div>
        </div>

        <div class="card mb-3">
            <div class="card-content">
                <div class="media">
                    <div class="media-left">
                        <p class="image is-64x64">
                            <img src="images/book_datamesh_dehghani_german.jpeg">
                        </p>
                    </div>
                    <div class="media-content">
                        <p class="title is-4">
                            <a href="https://oreilly.de/produkt/data-mesh/">Data Mesh</a>
                        </p>
                        <p class="subtitle is-6">by Zhamak Dehghani</p>
                    </div>
                </div>

                <div class="content">
                    Zhamak's book about data mesh, but in German and in color.
                    Translated by Jochen and Simon, who are co-authors of this website.
                </div>
            </div>
        </div>

        <div class="card mb-3">
            <div class="card-content">
                <div class="media">
                    <div class="media-left">
                        <p class="image is-64x64">
                            <img src="images/book_datameshinaction.jpg">
                        </p>

                    </div>
                    <div class="media-content">
                        <p class="title is-4">
                            <a href="https://www.manning.com/books/data-mesh-in-action">Data Mesh in Action</a>
                        </p>
                        <p class="subtitle is-6">
                            by Jacek Majchrzak, Sven Balnojan, and Marian Siwiak
                        </p>
                    </div>
                </div>

                <div class="content">
                    Data Mesh in Action is a great hands-on book.
                    We really liked the MVP hat shows how to start implementing a data mesh.
                </div>
            </div>
        </div>

        <div class="card mb-3">
            <div class="card-content">
                <div class="media">
                    <div class="media-left">
                        <span class=" icon is-large has-text-info">
                        <i class="fa-solid fa-file-lines fa-2x"></i>
                        </span>
                    </div>
                    <div class="media-content">
                        <p class="title is-4">
                            <a href="https://www.innoq.com/en/articles/2022/04/data-mesh-decentralized-data-analytics-for-software-engineers/">Data Mesh: Decentralized Data Analytics for Software Engineers</a>
                        </p>
                        <p class="subtitle is-6">
                            by Jochen Christ
                        </p>
                    </div>
                </div>

                <div class="content">
                    Intro article to data mesh with examples in the Google Cloud.
                </div>
            </div>
        </div>

    </section>

    <section class="section is-medium">

        <h2 class="title is-1" id="authors">Authors</h2>

        <div class="columns is-gapless">
            <div class="column">


            <div class="card" style="box-shadow: none;">
            <div class="card-content">
                <div class="media">
                    <div class="media-left">
                        <figure class="image is-64x64">
                            <img class="is-rounded" src="images/jochen.webp" alt="Jochen Christ">
                        </figure>
                    </div>
                    <div class="media-content">
                        <p class="title is-4"><a href="https://www.innoq.com/en/staff/jochen-christ/">Jochen</a></p>
                        <p class="subtitle is-6"><a href="https://twitter.com/jochen_christ">@jochen_christ</a></p>
                    </div>
                </div>

                <div class="content">
                    Jochen Christ works as tech lead at INNOQ and is a specialist for self-contained systems and data mesh.
                    Jochen is maintainer of <a href="https://www.http-feeds.org">HTTP Feeds</a>, <a href="https://whichjdk.com">Which JDK</a>, and co-author of <a href="https://www.remotemobprogramming.org">Remote Mob Programming</a>.
                </div>
            </div>
            </div>
            </div>
            <div class="column">

            <div class="card" style="box-shadow: none;">
                <div class="card-content">
                    <div class="media">
                        <div class="media-left">
                            <figure class="image is-64x64">
                                <img class="is-rounded" src="images/larysa.webp" alt="Larysa Visengeriyeva">
                            </figure>
                        </div>
                        <div class="media-content">
                            <p class="title is-4"><a href="https://www.innoq.com/en/staff/larysa-visengeriyeva/">Larysa</a></p>
                            <p class="subtitle is-6"><a href="https://twitter.com/visenger">@visenger</a></p>
                        </div>
                    </div>

                    <div class="content">
                        Dr. Larysa Visengeriyeva received her doctorate in Augmented Data Quality Management at the TU Berlin. At INNOQ she is working on the <a href="https://ml-ops.org/">operationalization of Machine Learning (MLOps)</a>.
                    </div>
                </div>
            </div>

            </div>
            <div class="column">

            <div class="card" style="box-shadow: none;">
                <div class="card-content">
                    <div class="media">
                        <div class="media-left">
                            <figure class="image is-64x64">
                                <img class="is-rounded" src="images/simon.webp" alt="Simon Harrer">
                            </figure>
                        </div>
                        <div class="media-content">
                            <p class="title is-4"><a href="https://www.innoq.com/en/staff/dr-simon-harrer/">Simon</a></p>
                            <p class="subtitle is-6"><a href="https://twitter.com/simonharrer">@simonharrer</a></p>
                        </div>
                    </div>

                    <div class="content">
                        Dr. Simon Harrer is a curious person working at INNOQ who likes to share his knowledge.
                        He's a serial co-author of <a href="https://java.by-comparison.com">Java by Comparison</a>, <a href="https://remotemobprogramming.org">Remote Mob Programming</a>, <a href="https://gitops.tech">GitOps</a>, and, most recently, <a href="https://datamesh-architecture.com/">Data Mesh</a>.
                    </div>
                </div>
            </div>
            </div>

        </div>


    </section>

    <section class="section is-medium">

        <div class="content">
            <h2 class="title is-1" id="contributors">Contributors</h2>
            <p>
                A ton of people helped us curate our content through their great feedback. Special thanks to:
                Anja Kammer,
                Benedikt Stemmhildt,
                Benjamin Wolf,
                Eberhard Wolff,
                Gernot Starke,
                Jan Bode,
                Jan Schwake,
                Julian Schikarski,
                Jörg Müller,
                Markus Harrer,
                Matthias Geiger,
                Philipp Beyerlein,
                Rainer Jaspert,
                Stefan Tilkov,
                Tammo van Lessen, and
                Theo Pack.
            </p>
            <p>
                And if you have feedback for us as well, feel free to <a href="https://github.com/datamesh-architecture/datamesh-architecture.com/discussions">discuss with us on GitHub</a> or <a href="#authors">reach out to us directly</a>!
            </p>
        </div>

    </section>

</div>

<footer class="footer">
    <div class="content has-text-centered">
        <p>
            <a href="https://www.innoq.com">
                <img src="/images/supported-by-innoq--petrol-apricot.svg" alt="Supported by INNOQ" class="footer-logo" width="180" />
            </a>
        </p>
        <p>
            <a href="https://www.innoq.com/en/topics/data-mesh-workshop?ref=dma-footer">Workshop</a>&nbsp
            <a href="https://www.socreatory.com/de/trainings/datamesh?ref=dma-footer">Training</a>&nbsp
            <a href="https://www.innoq.com/en/impressum/">Legal Notice</a>&nbsp
            <a href="https://www.innoq.com/en/datenschutz/">Privacy</a>
        </p>
    </div>
</footer>

<script src="/js/navigation.js"></script>

<script async defer src="https://scripts.simpleanalyticscdn.com/latest.js"></script>
<script async src="https://scripts.simpleanalyticscdn.com/auto-events.js"></script>
<noscript><img src="https://queue.simpleanalyticscdn.com/noscript.gif" alt="" referrerpolicy="no-referrer-when-downgrade" /></noscript>

<link rel="stylesheet" href="css/glightbox.css" />
<script src="js/glightbox.js"></script>
<script type="text/javascript">
    const lightbox = GLightbox({});
</script>


<script src="js/anchor.min.js"></script>
<script>anchors.add();</script>
</body>

</html>