slides/structured-streaming-stateful-stream-processing.html

<!doctype html>
<html lang="en">
  <head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">

    <title>Apache Spark™ Workshop | Structured Streaming | Stateful Stream Processing</title>

    <meta name="description" content="Apache Spark™ Workshop | Structured Streaming | Stateful Stream Processing">
    <meta name="author" content="Jacek Laskowski">

    <link rel="stylesheet" href="reveal.js/css/reveal.css">
    <link rel="stylesheet" href="reveal.js/css/theme/beige.css">

    <!-- Theme used for syntax highlighting of code -->
    <link rel="stylesheet" href="reveal.js/lib/css/zenburn.css">

    <!-- Jacek: custom formatting -->
    <link rel="stylesheet" href="revealjs-css/jacek.css">

    <!-- Printing and PDF exports -->
    <script>
      // Build a <link rel="stylesheet"> for print output and attach it to <head>.
      // When the URL query string contains "print-pdf" (matched case-insensitively)
      // the PDF stylesheet is used; otherwise the paper stylesheet is used.
      var link = document.createElement('link');
      link.rel = 'stylesheet';
      link.type = 'text/css';
      if (window.location.search.match(/print-pdf/gi)) {
        link.href = 'reveal.js/css/print/pdf.css';
      } else {
        link.href = 'reveal.js/css/print/paper.css';
      }
      document.getElementsByTagName('head')[0].appendChild(link);
    </script>
  </head>

  <body>
    <div class="reveal">

      <div class="footer">
        <footer style="font-size: small;">
          &copy; <a href="https://medium.com/@jaceklaskowski">Jacek Laskowski</a> 2019 / <a href="https://twitter.com/jaceklaskowski">@JacekLaskowski</a>
          / jacek@japila.pl
        </footer>
      </div>

      <div class="slides">

        <section class="intro" data-transition="zoom" id="home">
          <p>
            <img width="12%" style="background:none; border:none; box-shadow:none;" data-src="images/spark-logo.png" alt="Apache Spark logo">
            <img width="6%" src="images/jacek_laskowski_20141201_512px.png" style="border: 0" alt="Jacek Laskowski">
          </p>
          <h1 style="font-size: 3.17em;">Stateful Stream Processing</h1>
          <h3>Apache Spark 2.4.3 / Structured Streaming</h3>

          <hr />
          <h4 style="font-size: smaller;">
            <a href="https://twitter.com/jaceklaskowski">@jaceklaskowski</a> / <a href="https://stackoverflow.com/users/1305344/jacek-laskowski">StackOverflow</a> / <a href="https://github.com/jaceklaskowski">GitHub</a>
            <br>
            The "Internals" Books: <a href="https://bit.ly/apache-spark-internals">Apache Spark</a> / <a href="https://bit.ly/spark-sql-internals">Spark SQL</a> / <a href="https://bit.ly/spark-structured-streaming">Spark Structured Streaming</a>
          </h4>
        </section>

        <section>
          <section id="speaker" style="font-size: 90%" data-markdown>
            <textarea data-template>
              <p><img width="12%" src="images/jacek_laskowski_20141201_512px.png" style="border: 0" alt="Jacek Laskowski"></p>

              * **Jacek Laskowski** is a freelance IT consultant
              * Specializing in **Spark**, Kafka, Kafka Streams, Scala
              * Development | Consulting | Training
              * Among contributors to <a href="https://github.com/apache/spark/graphs/contributors">Apache Spark</a>
              * Contact me at **jacek@japila.pl**
              * Follow [@JacekLaskowski](https://twitter.com/jaceklaskowski) on twitter <br>for more #ApacheSpark, #ApacheKafka, #KafkaStreams
            </textarea>
          </section>
          <section id="gitbooks">
            <p><img width="12%" src="images/jacek_laskowski_20141201_512px.png" style="border: 0" alt="Jacek Laskowski"></p>
            <div style="text-align: center">
              <p>Jacek is best known by the online "Internals" books:</p>
              <ol>
                <li><a href="https://bit.ly/apache-spark-internals">The Internals of Apache Spark</a></li>
                <li><a href="https://bit.ly/spark-sql-internals">The Internals of Spark SQL</a></li>
                <li><a href="https://bit.ly/spark-structured-streaming">The Internals of Spark Structured Streaming</a></li>
                <li><a href="https://bit.ly/kafka-streams-internals">The Internals of Kafka Streams</a></li>
                <li><a href="https://bit.ly/apache-kafka-internals">The Internals of Apache Kafka</a></li>
              </ol>
            </div>
          </section>
          <section id="stackoverflow-spark-structured-streaming">
            <div style="text-align: center">
              <p>
                Jacek is "active" on <a href="https://stackoverflow.com/users/1305344/jacek-laskowski">StackOverflow</a> 🥳
                <br />
                (<b>spark-structured-streaming</b> tag)
              </p>
              <p>
                <a href="https://stackoverflow.com/tags/spark-structured-streaming/topusers"><img width="65%" src="images/jaceklaskowski-stackoverflow-spark-structured-streaming.png" style="border: 0" alt="Top users of the spark-structured-streaming tag on StackOverflow"></a>
              </p>
            </div>
          </section>
          <section id="stackoverflow-apache-spark">
            <div style="text-align: center">
              <p>
                Jacek is "active" on <a href="https://stackoverflow.com/users/1305344/jacek-laskowski">StackOverflow</a> 🥳
                <br />
                (<b>apache-spark</b> tag)
              </p>
              <p>
                <a href="https://stackoverflow.com/tags/apache-spark/topusers"><img width="60%" src="images/jaceklaskowski-stackoverflow-apache-spark.png" style="border: 0" alt="Top users of the apache-spark tag on StackOverflow"></a>
              </p>
            </div>
          </section>
        </section>

        <section id="agenda" style="font-size: 90%" data-markdown>
          <textarea data-template>
            ## Agenda

            1. [Structured Streaming](#/intro)
            1. [Stateful Stream Processing](#/stateful-stream-processing)
            1. [Streaming Aggregation](#/streaming-aggregation)
            1. [Streaming Watermark](#/streaming-watermark)
            1. [Streaming Join](#/streaming-join)
            1. [Arbitrary Stateful Streaming Aggregation](#/arbitrary-stateful-streaming-aggregation)
            1. [State Store](#/state-store)
          </textarea>
        </section>

        <section id="intro" data-markdown style="font-size: 80%">
          <textarea data-template>
            ## Structured Streaming

            1. **Structured Streaming** (aka **Spark Streams**) is the module of Apache Spark for stream processing
            1. Introduces **streaming queries**
              * Extension of batch queries in Spark SQL
              * Executed by Spark SQL engine
            1. Expressed using high-level declarative languages
              * **Dataset API**
              * **SQL**
              * Exactly as in Spark SQL <small>(almost)</small>
            1. Logical and physical query plans with operators are based on Spark SQL
            1. Supports **fault-tolerant state stores** for stateful operations (e.g. windowed aggregations and joins)
          </textarea>
        </section>

        <section>
          <section id="stateful-stream-processing" data-markdown style="font-size: 80%">
            <textarea data-template>
              ## Stateful Stream Processing <small>(1 of 2)</small>

              1. **Stateful Stream Processing** is stream processing with some state (implicit or explicit)
              1. In Structured Streaming, a streaming query is stateful when it uses the following high-level declarative operators (or their SQL variants):
                * **Dataset.groupBy** for [Streaming Aggregation](#/streaming-aggregation)
                * **Dataset.rollup** for [Streaming Aggregation](#/streaming-aggregation)
                * **Dataset.cube** for [Streaming Aggregation](#/streaming-aggregation)
                * **Dataset.groupByKey** for [Streaming Aggregation](#/streaming-aggregation)
                * **Dataset.dropDuplicates** for Streaming Deduplication <small>(_not covered_)</small>
                * **Dataset.join** for [Streaming Join](#/streaming-join)
                * **Dataset.limit** for Streaming Limit <small>(_not covered_)</small>
            </textarea>
          </section>
          <section data-markdown style="font-size: 80%">
            <textarea data-template>
              ## Stateful Stream Processing <small>(2 of 2)</small>

              1. Any structured query (incl. streaming queries) becomes a logical query plan
                * High-level declarative operators simply create an **unanalyzed logical plan**
              1. Let's rephrase then what was said earlier
              1. In Structured Streaming, a streaming query is stateful when it uses the following physical operators:
                * **StateStoreRestoreExec** and **StateStoreSaveExec**
                * **StreamingDeduplicateExec**
                * **FlatMapGroupsWithStateExec**
                * **StreamingSymmetricHashJoinExec**
                * **StreamingGlobalLimitExec**
            </textarea>
          </section>
        </section>

        <section>
          <section id="streaming-aggregation" data-markdown style="font-size: 80%">
            <textarea data-template>
              ## Streaming Aggregation <small>(1 of 2)</small>

              1. **Streaming Aggregation** is a streaming query that uses the following high-level declarative operators (or their SQL variants):
                * **Dataset.groupBy**
                * **Dataset.rollup**
                * **Dataset.cube**
                * **Dataset.groupByKey**
              1. Creates **Aggregate** logical operator
              1. **window** function allows for **windowed streaming aggregation**
                * Tumbling windows
                * Sliding windows
            </textarea>
          </section>
          <section id="streaming-aggregation-demo" data-markdown style="font-size: 80%">
            <textarea data-template>
              ## Streaming Aggregation's Demo <small>(2 of 2)</small>

              1. Go to [Demo: Streaming Query for Running Counts](https://jaceklaskowski.gitbooks.io/spark-structured-streaming/spark-sql-streaming-demo-groupBy-running-count-complete.html)
            </textarea>
          </section>
        </section>

        <section>
          <section id="streaming-watermark" data-markdown style="font-size: 80%">
            <textarea data-template>
              ## Streaming Watermark <small>(1 of 2)</small>

              1. **Streaming Watermark** (**Event-Time Watermark**) of a stateful streaming query is a time threshold to wait for late and possibly out-of-order events until a streaming state (for a given key) can be considered final
              1. Used to mark events that are older than the threshold as "too late", and not "interesting" to update partial non-final aggregates
              1. Avoids unbounded streaming state
                * Prevents OOMEs
              1. Expressed with **Dataset.withWatermark** high-level operator
                * Defined on grouping expression(s) of a streaming aggregation (directly or using **window** standard function)
                * Creates **EventTimeWatermark** logical operator
            </textarea>
          </section>
          <section id="streaming-watermark-demo" data-markdown style="font-size: 80%">
            <textarea data-template>
              ## Streaming Watermark's Demo <small>(2 of 2)</small>

              1. Go to [Demo: Streaming Watermark with Aggregation in Append Output Mode](https://jaceklaskowski.gitbooks.io/spark-structured-streaming/spark-sql-streaming-demo-watermark-aggregation-append.html)
              1. Go to [Demo: Streaming Aggregation with Kafka Data Source](https://jaceklaskowski.gitbooks.io/spark-structured-streaming/spark-sql-streaming-demo-kafka-data-source.html) for event-time windows
            </textarea>
          </section>
        </section>

        <section>
          <section id="streaming-join" data-markdown style="font-size: 80%">
            <textarea data-template>
              ## Streaming Join <small>(1 of 2)</small>

              1. **Streaming Join** is a streaming query that uses **Dataset.join** or SQL's **JOIN** high-level query operators
              1. Creates **Join** logical operator
              1. Streaming joins can be **stateless** or **stateful**
                * Joins of a streaming query and a batch query (aka **stream-static joins**) are stateless and no state management is necessary
                * Joins of two streaming queries (aka **stream-stream joins**) are stateful and require streaming state (with a streaming watermark)
            </textarea>
          </section>
          <section id="streaming-join-demo" data-markdown style="font-size: 80%">
            <textarea data-template>
              ## Streaming Join's Demo <small>(2 of 2)</small>

              1. Go to [Demo: Streaming Join of Streaming Queries and StreamingSymmetricHashJoinExec Physical Operator](https://jaceklaskowski.gitbooks.io/spark-structured-streaming/spark-sql-streaming-demo-join-stream-stream-StreamingSymmetricHashJoinExec.html)
            </textarea>
          </section>
        </section>

        <section>
          <section id="arbitrary-stateful-streaming-aggregation" data-markdown style="font-size: 90%">
            <textarea data-template>
              ## Arbitrary Stateful Streaming Aggregation <small>(1 of 2)</small>

              1. **Arbitrary Stateful Streaming Aggregation** is a streaming query that uses the following **KeyValueGroupedDataset** operators:
                * **mapGroupsWithState**
                * **flatMapGroupsWithState**
              1. Creates **FlatMapGroupsWithState** logical operator
            </textarea>
          </section>
          <section data-markdown style="font-size: 90%">
            <textarea data-template>
              ## Arbitrary Stateful Streaming Aggregation <small>(2 of 2)</small>

```scala
KeyValueGroupedDataset.flatMapGroupsWithState[S: Encoder, U: Encoder](
  outputMode: OutputMode,
  timeoutConf: GroupStateTimeout)(
  func: (K, Iterator[V], GroupState[S]) => Iterator[U]): Dataset[U]
```
              1. **GroupState** is per-group state data
              1. Every time the state function **func** is executed for a **K** key, the state (as **GroupState[S]**) is for this key only
              1. **GroupStateTimeout** allows for a timeout based on processing time, event-time or no timeout at all
              1. Requires **Append** or **Update** output modes
            </textarea>
          </section>
        </section>

        <section id="state-store" data-markdown style="font-size: 80%">
          <textarea data-template>
            ## State Store

            1. Structured Streaming uses versioned (and possibly fault-tolerant) **key-value state stores** for streaming state data
            1. State stores are on executors in **state** directory under **checkpointLocation**
            1. Supports **incremental checkpointing** - only the key-value "Row" pairs that changed can be committed or aborted (other pairs are not touched)
            1. Every state store is **identified** (e.g. aggregating operator id, the partition id)
            1. **HDFSBackedStateStore** is the default (and only) state store
              * Uses an HDFS-compatible file system for versioned state persistence
            1. [spark-state-tools](https://github.com/HeartSaVioR/spark-state-tools) for offline manipulation of state stores
          </textarea>
        </section>

        <section id="recap" style="font-size: 90%" data-markdown>
          <textarea data-template>
            ## Recap

            1. [Structured Streaming](#/intro)
            1. [Stateful Stream Processing](#/stateful-stream-processing)
            1. [Streaming Aggregation](#/streaming-aggregation)
            1. [Streaming Watermark](#/streaming-watermark)
            1. [Streaming Join](#/streaming-join)
            1. [Arbitrary Stateful Streaming Aggregation](#/arbitrary-stateful-streaming-aggregation)
            1. [State Store](#/state-store)
          </textarea>
        </section>

        <section style="text-align: left" data-markdown id="questions">
          <textarea data-template>
            # Questions?

            * Read [The Internals of Apache Spark](https://bit.ly/apache-spark-internals)
            * Read [The Internals of Spark SQL](https://bit.ly/spark-sql-internals)
            * Read [The Internals of Spark Structured Streaming](https://bit.ly/spark-structured-streaming)
            * Follow [@jaceklaskowski](https://twitter.com/jaceklaskowski) on twitter (DMs open)
            * Upvote [my questions and answers on StackOverflow](https://stackoverflow.com/users/1305344/jacek-laskowski)
            * Contact me at **jacek@japila.pl**
          </textarea>
        </section>

      </div>
    </div>

    <script src="reveal.js/lib/js/head.min.js"></script>
    <script src="reveal.js/js/reveal.js"></script>

    <script>
      // Presentation bootstrap: a single Reveal.initialize() call carrying the
      // deck options and the list of scripts to load as dependencies.
      // Option and dependency reference:
      // - https://github.com/hakimel/reveal.js#configuration
      // - https://github.com/hakimel/reveal.js#dependencies
      Reveal.initialize({
        controls: true,
        progress: true,
        history: true,
        center: true,
        slideNumber: true,

        // One of: none / fade / slide / convex / concave / zoom
        transition: 'slide',

        // NOTE(review): no menu plugin is listed in `dependencies` below, so these
        // options presumably target a plugin loaded elsewhere — TODO confirm.
        menu: {
          markers: true,
          openSlideNumber: true
        },

        dependencies: [
          {
            // Requested only when document.body.classList is missing.
            src: 'reveal.js/lib/js/classList.js',
            condition: function () {
              return !document.body.classList;
            }
          },
          // marked.js is listed ahead of markdown.js; keep this relative order.
          {
            src: 'reveal.js/plugin/markdown/marked.js'
          },
          {
            src: 'reveal.js/plugin/markdown/markdown.js'
          },
          {
            src: 'reveal.js/plugin/zoom-js/zoom.js',
            async: true
          },
          {
            src: 'reveal.js/plugin/notes/notes.js',
            async: true
          },
          {
            // The callback starts highlight.js via hljs.initHighlightingOnLoad().
            src: 'reveal.js/plugin/highlight/highlight.js',
            async: true,
            callback: function () {
              hljs.initHighlightingOnLoad();
            }
          }
        ]
      });
    </script>
    <script>
      // Google Analytics (analytics.js) loader.
      //
      // Registers a global command function under the given name ('ga'), then
      // injects an async <script> for analytics.js before the first <script> in
      // the document.
      //
      // The script URL is absolute https:// rather than protocol-relative: with
      // '//host/path' the scheme is inherited from the page, which resolves to
      // file:// when the slides are opened from disk and to plain http:// on
      // non-TLS hosting.
      (function (win, doc, tagName, scriptUrl, globalName) {
        win['GoogleAnalyticsObject'] = globalName;

        // Stub command function: every call has its arguments appended to the
        // `q` array on the function itself. An already-defined function is kept.
        win[globalName] = win[globalName] || function () {
          (win[globalName].q = win[globalName].q || []).push(arguments);
        };
        // Numeric timestamp of when this snippet ran.
        win[globalName].l = 1 * new Date();

        var loader = doc.createElement(tagName);
        var firstScript = doc.getElementsByTagName(tagName)[0];
        loader.async = 1;
        loader.src = scriptUrl;
        firstScript.parentNode.insertBefore(loader, firstScript);
      })(window, document, 'script', 'https://www.google-analytics.com/analytics.js', 'ga');

      ga('create', 'UA-45999426-3', 'auto');
      ga('send', 'pageview');

    </script>
  </body>
</html>