diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..1230149
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,6 @@
+version: 2
+updates:
+  - package-ecosystem: "github-actions"
+    directory: "/"
+    schedule:
+      interval: "daily"
diff --git a/.github/workflows/check-dependencies-updates.yml b/.github/workflows/check-dependencies-updates.yml
new file mode 100644
index 0000000..f43f8a4
--- /dev/null
+++ b/.github/workflows/check-dependencies-updates.yml
@@ -0,0 +1,16 @@
+on:
+  schedule:
+    - cron: '0 6 * * 1-5'
+
+name: 🍄 Check dependencies updates
+
+permissions:
+  contents: write
+  pull-requests: write
+
+jobs:
+  scala-steward:
+    runs-on: ubuntu-22.04
+    name: Check Scala project dependencies updates with Scala Steward
+    steps:
+      - uses: scala-steward-org/scala-steward-action@v2
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000..f68e0e6
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,25 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+permissions:
+  contents: read
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: actions/setup-java@v4
+        with:
+          distribution: 'zulu'
+          java-version: '21'
+          cache: 'sbt'
+      - name: 👌 Run "pre-push" tasks (compile and style-check)
+        run: sbt prep
+      - name: ✅ Run test
+        run: sbt test
diff --git a/.github/workflows/update-github-dependency-graph.yml b/.github/workflows/update-github-dependency-graph.yml
new file mode 100644
index 0000000..bb88b08
--- /dev/null
+++ b/.github/workflows/update-github-dependency-graph.yml
@@ -0,0 +1,16 @@
+name: Update GitHub Dependency Graph
+
+on:
+  push:
+    branches:
+      - main
+
+permissions:
+  contents: write
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: scalacenter/sbt-dependency-submission@v3
diff --git a/.scalafmt.conf b/.scalafmt.conf
index ac535ae..660cd8b 100644
--- a/.scalafmt.conf
+++ b/.scalafmt.conf
@@ -1,4 +1,11 @@
-version=2.5.2
+version = 3.8.2
+runner.dialect = scala213
+style = default
+maxColumn = 120
+continuationIndent.callSite = 2
 align.preset = more
-maxColumn = 80
-importSelectors = singleLine
\ No newline at end of file
+runner.optimizer.forceConfigStyleMinArgCount = 1
+rewrite.rules = [SortImports]
+importSelectors = singleLine
+project.excludeFilters = ["target/"]
+project.git = true # Only format files tracked by git
diff --git a/build.sbt b/build.sbt
index 0b2fb73..ca895e8 100644
--- a/build.sbt
+++ b/build.sbt
@@ -1,38 +1,8 @@
-scalaVersion := "2.12.12"
-version := "0.1.0-SNAPSHOT"
-name := "spark-for-programmers-course"
-organization := "com.codely"
+Settings.settings
 
-val sparkVesion = "3.5.0"
+libraryDependencies := Dependencies.all
 
-libraryDependencies ++= Seq(
-  "org.apache.spark" %% "spark-core" % sparkVesion,
-  "org.apache.spark" %% "spark-sql" % sparkVesion,
-  "org.apache.spark" %% "spark-hive" % sparkVesion,
-  "org.apache.spark" %% "spark-streaming" % sparkVesion,
-  "org.apache.spark" %% "spark-sql-kafka-0-10" % sparkVesion,
-  "io.delta" %% "delta-spark" % "3.1.0",
-  // "com.amazonaws" % "aws-java-sdk-bundle" % "1.11.375",
-  "org.apache.hadoop" % "hadoop-aws" % "3.2.2",
-  "com.rabbitmq" % "amqp-client" % "5.12.0",
-  "com.typesafe" % "config" % "1.4.1",
-  //"org.apache.hadoop" % "hadoop-common" % "3.3.1",
-  "org.scalatest" %% "scalatest" % "3.2.18" % Test,
-  "org.scalatest" %% "scalatest-flatspec" % "3.2.18" % Test,
-  "com.dimafeng" %% "testcontainers-scala" % "0.40.12" % Test,
-  "com.dimafeng" %% "testcontainers-scala-kafka" % "0.40.12" % Test,
-  "com.dimafeng" %% "testcontainers-scala-postgresql" % "0.41.4" % Test,
-  "org.postgresql" % "postgresql" % "9.4.1207" % Test,
-  "org.mockito" %% "mockito-scala" % "1.16.42" % Test
-)
-
-assembly / mainClass := Some(
-  "com.codely.lesson_07_spark_optimize_and_monitoring.video_01__deploy_application.DeploySparkApp"
-)
-
-assembly / assemblyMergeStrategy := {
-  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
-  case PathList("org", "apache", "spark", "unused", "UnusedStubClass.class") =>
-    MergeStrategy.first
-  case _ => MergeStrategy.first
+SbtAliases.aliases.flatMap {
+  case (alias, command) =>
+    addCommandAlias(alias, command)
 }
diff --git a/doc/hooks/install-hooks.sh b/doc/hooks/install-hooks.sh
new file mode 100755
index 0000000..c50abc3
--- /dev/null
+++ b/doc/hooks/install-hooks.sh
@@ -0,0 +1,8 @@
+#!/bin/sh
+
+cd "$(dirname "$0")/../.."
+
+rm -rf .git/hooks
+
+ln -s ../doc/hooks .git/hooks
+sudo chmod -R 777 doc/hooks/*
diff --git a/doc/hooks/pre-push b/doc/hooks/pre-push
new file mode 100755
index 0000000..3d6953e
--- /dev/null
+++ b/doc/hooks/pre-push
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Checks if locally staged changes are formatted properly ignoring non-staged changes.
+# Install it with the `install-hooks.sh` script
+# Based on: https://gist.github.com/cvogt/2676ed6c6d1abafa3d6a
+
+PATH=$PATH:/usr/local/bin:/usr/local/sbin
+
+echo ""
+echo "Running pre-push hook… (you can omit this with --no-verify, but don't)"
+
+echo "* Moving to the project directory…"
+_DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )
+DIR=$( echo $_DIR | sed 's/\/.git\/hooks$//' )
+
+echo "* Stashing non-staged changes so we avoid checking them…"
+git diff --quiet
+hadNoNonStagedChanges=$?
+
+if ! [ $hadNoNonStagedChanges -eq 0 ]
+then
+  git stash --keep-index -u > /dev/null
+fi
+
+echo "* Checking pre push conditions ('prep' SBT task)…"
+sbt prep > /dev/null
+canPush=$?
+
+if [ $canPush -ne 0 ]
+then
+  echo " [KO] Error :("
+fi
+
+echo "* Applying the stash with the non-staged changes…"
+if ! [ $hadNoNonStagedChanges -eq 0 ]
+then
+  sleep 1 && git stash pop --index > /dev/null & # sleep because otherwise commit fails when this leads to a merge conflict
+fi
+
+# Final result
+echo ""
+
+if [ $canPush -eq 0 ]
+then
+  echo "[OK] Your code will be pushed young Padawan"
+  exit 0
+else
+  echo "[KO] Cancelling push due to test code style error (run 'sbt prep' for more information)"
+  exit 1
+fi
diff --git a/project/Dependencies.scala b/project/Dependencies.scala
new file mode 100644
index 0000000..2a3adbb
--- /dev/null
+++ b/project/Dependencies.scala
@@ -0,0 +1,20 @@
+import sbt._
+
+object Dependencies {
+  private val prod = Seq(
+    "com.github.nscala-time" %% "nscala-time" % "2.32.0",
+    "com.lihaoyi" %% "pprint" % "0.9.0",
+    "org.apache.spark" %% "spark-core" % "3.5.0" % Provided,
+    "org.apache.spark" %% "spark-sql" % "3.5.0" % Provided,
+    "org.apache.spark" %% "spark-streaming" % "3.5.0",
+    "org.apache.spark" %% "spark-hive" % "3.5.0",
+    "io.delta" %% "delta-spark" % "3.1.0",
+    "org.apache.hadoop" % "hadoop-aws" % "3.2.2"
+  )
+  private val test = Seq(
+    "org.scalatest" %% "scalatest" % "3.2.19",
+    "org.mockito" %% "mockito-scala" % "1.16.42"
+  ).map(_ % Test)
+
+  val all: Seq[ModuleID] = prod ++ test
+}
diff --git a/project/SbtAliases.scala b/project/SbtAliases.scala
new file mode 100644
index 0000000..8e0a9f7
--- /dev/null
+++ b/project/SbtAliases.scala
@@ -0,0 +1,15 @@
+object SbtAliases {
+  val aliases: Seq[(String, String)] = Seq(
+    "t" -> "test",
+    "to" -> "testOnly",
+    "tq" -> "testQuick",
+    "tsf" -> "testShowFailed",
+    "c" -> "compile",
+    "tc" -> "Test / compile",
+    "f" -> "scalafmt", // Format production files according to ScalaFmt
+    "fc" -> "scalafmtCheck", // Check if production files are formatted according to ScalaFmt
+    "tf" -> "Test / scalafmt", // Format test files according to ScalaFmt
+    "tfc" -> "Test / scalafmtCheck", // Check if test files are formatted according to ScalaFmt
+    "prep" -> ";c;tc;fc;tfc" // All the needed tasks before pushing to the repository (compile, compile test, format check in prod and test)
+  )
+}
diff --git a/project/Settings.scala b/project/Settings.scala
new file mode 100644
index 0000000..00c2768
--- /dev/null
+++ b/project/Settings.scala
@@ -0,0 +1,44 @@
+import sbt.Keys._
+import sbt.io.syntax._
+import sbt.{Compile, Test, TestFrameworks, Tests, Configuration => _}
+
+object Settings {
+  val settings = Seq(
+    name := "spark-for-devs-course",
+    version := "0.1.0-SNAPSHOT",
+    scalaVersion := "2.12.12",
+    organization := "com.codely",
+    organizationName := "com.codely, Inc.",
+    organizationHomepage := Some(url("https://com.codely")),
+    // Custom folders path (remove the `/scala` default subdirectory)
+    Compile / scalaSource := file(
+      (baseDirectory.value / "src" / "main").toString
+    ),
+    Test / scalaSource := file((baseDirectory.value / "src" / "test").toString),
+    // Compiler options
+    scalacOptions ++= Seq(
+      "-deprecation", // Warnings deprecation
+      "-feature", // Advise features
+      "-unchecked", // More warnings. Strict
+      "-Xlint", // More warnings when compiling
+      "-Ywarn-dead-code",
+      "-Ywarn-unused"
+    ),
+    Test / scalacOptions += "-Xcheckinit", // Check against early initialization only in tests because it's expensive
+    javaOptions += "-Duser.timezone=UTC",
+    // Test options
+    Test / parallelExecution := false,
+    Test / testForkedParallel := false,
+    Test / fork := true,
+    Test / testOptions ++= Seq(
+      Tests.Argument(
+        TestFrameworks.ScalaTest,
+        "-u",
+        "target/test-reports"
+      ), // Save test reports
+      Tests.Argument(
+        "-oDF"
+      ) // Show full stack traces and time spent in each test
+    )
+  )
+}
diff --git a/project/build.properties b/project/build.properties
index 0aa5c39..136f452 100644
--- a/project/build.properties
+++ b/project/build.properties
@@ -1 +1 @@
-sbt.version = 1.9.8
+sbt.version = 1.10.1
diff --git a/project/plugins.sbt b/project/plugins.sbt
index b256954..7d517ef 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,2 +1 @@
-addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "1.2.0")
 addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.5.2")
diff --git a/src/main/scala/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/FromCSVToSQL.scala b/src/main/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/FromCSVToSQL.scala
similarity index 93%
rename from src/main/scala/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/FromCSVToSQL.scala
rename to src/main/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/FromCSVToSQL.scala
index 71ed912..7633ab2 100644
--- a/src/main/scala/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/FromCSVToSQL.scala
+++ b/src/main/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/FromCSVToSQL.scala
@@ -13,7 +13,7 @@ object FromCSVToSQL extends App {
     .getOrCreate()
 
   val pathNetflixFile =
-    "src/main/scala/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/data/netflix_titles.csv"
+    "src/main/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/data/netflix_titles.csv"
 
   spark.read
     .csv(pathNetflixFile)
diff --git a/src/main/scala/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/data/netflix_titles.csv b/src/main/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/data/netflix_titles.csv
similarity index 100%
rename from src/main/scala/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/data/netflix_titles.csv
rename to src/main/com/codely/lesson_01__discover_apache_spark/video_01__from_excel_to_sql/data/netflix_titles.csv
diff --git a/src/main/scala/com/codely/lesson_01__discover_apache_spark/video_02__trafffic_bottleneck_detection/TrafficBottleneckDetection.scala b/src/main/com/codely/lesson_01__discover_apache_spark/video_02__trafffic_bottleneck_detection/TrafficBottleneckDetection.scala
similarity index 100%
rename from src/main/scala/com/codely/lesson_01__discover_apache_spark/video_02__trafffic_bottleneck_detection/TrafficBottleneckDetection.scala
rename to src/main/com/codely/lesson_01__discover_apache_spark/video_02__trafffic_bottleneck_detection/TrafficBottleneckDetection.scala
diff --git a/src/main/scala/com/codely/lesson_01__discover_apache_spark/video_02__trafffic_bottleneck_detection/TrafficDataGenerator.scala b/src/main/com/codely/lesson_01__discover_apache_spark/video_02__trafffic_bottleneck_detection/TrafficDataGenerator.scala
similarity index 100%
rename from
src/main/scala/com/codely/lesson_01__discover_apache_spark/video_02__trafffic_bottleneck_detection/TrafficDataGenerator.scala rename to src/main/com/codely/lesson_01__discover_apache_spark/video_02__trafffic_bottleneck_detection/TrafficDataGenerator.scala diff --git a/src/main/scala/com/codely/lesson_01__discover_apache_spark/video_03__intro_domain_events_analysis/HighPriceProductsPurchased.scala b/src/main/com/codely/lesson_01__discover_apache_spark/video_04__intro_domain_events_analysis/HighPriceProductsPurchased.scala similarity index 92% rename from src/main/scala/com/codely/lesson_01__discover_apache_spark/video_03__intro_domain_events_analysis/HighPriceProductsPurchased.scala rename to src/main/com/codely/lesson_01__discover_apache_spark/video_04__intro_domain_events_analysis/HighPriceProductsPurchased.scala index 39be1c0..53a186c 100644 --- a/src/main/scala/com/codely/lesson_01__discover_apache_spark/video_03__intro_domain_events_analysis/HighPriceProductsPurchased.scala +++ b/src/main/com/codely/lesson_01__discover_apache_spark/video_04__intro_domain_events_analysis/HighPriceProductsPurchased.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_01__discover_apache_spark.video_03__intro_domain_events_analysis +package com.codely.lesson_01__discover_apache_spark.video_04__intro_domain_events_analysis import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.functions.{col, desc, explode, lit, month} @@ -15,7 +15,7 @@ private object HighPriceProductsPurchased extends App { spark.sparkContext.setLogLevel("WARN") val purchasedCompletedFilePath = - "src/main/scala/com/codely/lesson_01__discover_apache_spark/video_03__intro_domain_events_analysis/data/purchasecompleted.json" + "src/main/com/codely/lesson_01__discover_apache_spark/video_04__intro_domain_events_analysis/data/purchasecompleted.json" spark.read .format("json") diff --git a/src/main/scala/com/codely/lesson_01__discover_apache_spark/video_03__intro_domain_events_analysis/data/purchasecompleted.json b/src/main/com/codely/lesson_01__discover_apache_spark/video_04__intro_domain_events_analysis/data/purchasecompleted.json similarity index 100% rename from src/main/scala/com/codely/lesson_01__discover_apache_spark/video_03__intro_domain_events_analysis/data/purchasecompleted.json rename to src/main/com/codely/lesson_01__discover_apache_spark/video_04__intro_domain_events_analysis/data/purchasecompleted.json diff --git a/src/main/scala/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/PlatformAccessAnalysis.scala b/src/main/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/PlatformAccessAnalysis.scala similarity index 94% rename from src/main/scala/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/PlatformAccessAnalysis.scala rename to src/main/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/PlatformAccessAnalysis.scala index 8591a35..db8b1be 100644 --- a/src/main/scala/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/PlatformAccessAnalysis.scala +++ b/src/main/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/PlatformAccessAnalysis.scala @@ -17,7 +17,7 @@ object PlatformAccessAnalysis extends App { // 2. 
Read data val accessEventFilePath = - "src/main/scala/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/data/accessevent.json" + "src/main/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/data/accessevent.json" val accessEventDF = spark.read.json(accessEventFilePath) accessEventDF.show() diff --git a/src/main/scala/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/data/accessevent.json b/src/main/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/data/accessevent.json similarity index 100% rename from src/main/scala/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/data/accessevent.json rename to src/main/com/codely/lesson_01__discover_apache_spark/z_practical_exercise/data/accessevent.json diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/CartActivityAnalysis.scala b/src/main/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/CartActivityAnalysis.scala similarity index 75% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/CartActivityAnalysis.scala rename to src/main/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/CartActivityAnalysis.scala index 364579f..784994c 100644 --- a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/CartActivityAnalysis.scala +++ b/src/main/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/CartActivityAnalysis.scala @@ -14,17 +14,17 @@ object CartActivityAnalysis extends App { val addedToCartDF = spark.read.json( - "src/main/scala/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/data/addedToCart.json" + "src/main/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/data/addedToCart.json" ) addedToCartDF .select(col("userId"), col("timestamp"), col("products")) + addedToCartDF.select("userId", "timestamp", "products") + addedToCartDF.selectExpr("userId", "timestamp", "products") import spark.implicits._ addedToCartDF.select($"userId", $"timestamp", $"products") addedToCartDF.select('userId, 'timestamp, 'products) - addedToCartDF.select("userId", "timestamp", "products") - addedToCartDF.selectExpr("userId", "timestamp", "products") addedToCartDF.filter(size(col("products")) === 1) addedToCartDF.filter("size(products) == 1") @@ -71,20 +71,22 @@ object CartActivityAnalysis extends App { expr("(products[0].quantity * products[0].price) as Total") ) - addedToCartDF - .filter("size(products) == 1") - .select( - col("timestamp").as("EventPublished"), - col("userId"), - col("products"), - expr( - "(products[0].quantity * products[0].price) as Total" + val onlyOneProductAddedToCartDF = + addedToCartDF + .filter("size(products) == 1") + .select( + col("timestamp").as("EventPublished"), + col("userId"), + col("products"), + expr( + "(products[0].quantity * products[0].price) as Total" + ) ) - ) - .withColumn( - "date", - to_date(col("EventPublished"), "yyyy-MM-dd'T'HH:mm:ss'Z'") - ) - .drop("EventPublished") - .show(false) + .withColumn( + "date", + to_date(col("EventPublished"), "yyyy-MM-dd'T'HH:mm:ss'Z'") + ) + .drop("EventPublished") + + onlyOneProductAddedToCartDF.show(false) } diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/data/addedToCart.json 
b/src/main/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/data/addedToCart.json similarity index 100% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/data/addedToCart.json rename to src/main/com/codely/lesson_02__analyze_domain_events/video_01__analysis_products_added_to_cart/data/addedToCart.json diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/UserActivityAnalysis.scala b/src/main/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/UserActivityAnalysis.scala similarity index 79% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/UserActivityAnalysis.scala rename to src/main/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/UserActivityAnalysis.scala index 46a08c2..638d574 100644 --- a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/UserActivityAnalysis.scala +++ b/src/main/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/UserActivityAnalysis.scala @@ -16,20 +16,20 @@ object UserActivityAnalysis extends App { private def readJson(path: String): DataFrame = spark.read.json(path) val viewedDF = readJson( - "src/main/scala/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/productViewed.json" + "src/main/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/productViewed.json" ) val addedToCartDF = readJson( - "src/main/scala/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/addedToCart.json" + "src/main/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/addedToCart.json" ) - /* It fails due to the fact that the columns are not in the same order + /* viewedDF .union(addedToCartDF) .show(false) */ - /* It fails due to the fact there are missing columns + /* viewedDF .unionByName(addedToCartDF) .show(false) diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/addedToCart.json b/src/main/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/addedToCart.json similarity index 100% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/addedToCart.json rename to src/main/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/addedToCart.json diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/productViewed.json b/src/main/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/productViewed.json similarity index 100% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/productViewed.json rename to src/main/com/codely/lesson_02__analyze_domain_events/video_02_user_activity_analysis/data/productViewed.json diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/AnalysisUserJourney.scala b/src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/AnalysisUserJourney.scala similarity index 82% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/AnalysisUserJourney.scala rename to 
src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/AnalysisUserJourney.scala index 604ba7e..eea3ce9 100644 --- a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/AnalysisUserJourney.scala +++ b/src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/AnalysisUserJourney.scala @@ -1,6 +1,6 @@ package com.codely.lesson_02__analyze_domain_events.video_03_analysis_user_journey -import org.apache.spark.sql.{DataFrame, SparkSession} +import org.apache.spark.sql.{Column, DataFrame, SparkSession} import org.apache.spark.sql.functions.{avg, col, collect_list, countDistinct, explode, month, sort_array, struct, to_date, year} object AnalysisUserJourney extends App { @@ -15,15 +15,15 @@ object AnalysisUserJourney extends App { private def readJson(path: String): DataFrame = spark.read.json(path) val productViewedDF = readJson( - "src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/productViewed.json" + "src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/productViewed.json" ) val productAddedDF = readJson( - "src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/addedToCart.json" + "src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/addedToCart.json" ) val productPurchasedDF = readJson( - "src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/purchasecompleted.json" + "src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/purchasecompleted.json" ) import spark.implicits._ @@ -77,12 +77,14 @@ object AnalysisUserJourney extends App { popularProductsDF.show(false) + private val userJourneyItem: Column = struct("timestamp", "eventType", "productId") + val userJourneyDF = allEventsDF .withColumn("date", to_date(col("timestamp"))) .groupBy("userId", "date") .agg( sort_array( - collect_list(struct("timestamp", "eventType", "productId")), + collect_list(userJourneyItem), asc = true ).as("UserJourney") ) diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/addedToCart.json b/src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/addedToCart.json similarity index 100% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/addedToCart.json rename to src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/addedToCart.json diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/productViewed.json b/src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/productViewed.json similarity index 100% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/productViewed.json rename to src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/productViewed.json diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/purchasecompleted.json b/src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/purchasecompleted.json similarity index 100% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/purchasecompleted.json rename 
to src/main/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/purchasecompleted.json diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/UserSessionAnalysis.scala b/src/main/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/UserSessionAnalysis.scala similarity index 91% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/UserSessionAnalysis.scala rename to src/main/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/UserSessionAnalysis.scala index 8b40220..24073f7 100644 --- a/src/main/scala/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/UserSessionAnalysis.scala +++ b/src/main/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/UserSessionAnalysis.scala @@ -17,9 +17,9 @@ object UserSessionAnalysis extends App { // 2. val sessionsFilePath = - "src/main/scala/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/sessions.csv" + "src/main/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/sessions.csv" val eventsFilePath = - "src/main/scala/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/events.txt" + "src/main/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/events.txt" val sessionsSchema = StructType( Array( diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/events.txt b/src/main/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/events.txt similarity index 100% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/events.txt rename to src/main/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/events.txt diff --git a/src/main/scala/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/sessions.csv b/src/main/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/sessions.csv similarity index 100% rename from src/main/scala/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/sessions.csv rename to src/main/com/codely/lesson_02__analyze_domain_events/z_practical_exercise/data/sessions.csv diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/CacheInDataFrames.scala b/src/main/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/CacheInDataFrames.scala similarity index 83% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/CacheInDataFrames.scala rename to src/main/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/CacheInDataFrames.scala index 9baa7e2..89336c3 100644 --- a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/CacheInDataFrames.scala +++ b/src/main/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/CacheInDataFrames.scala @@ -20,7 +20,7 @@ object CacheInDataFrames extends App { spark.sparkContext.setJobGroup("GroupID_1", "Read Data") val productPurchasedDF = readJson( - "src/main/scala/com/codely/lesson_02__analyze_domain_events/video_03_analysis_user_journey/data/purchasecompleted.json" + "src/main/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/data/purchasecompleted.json" ) spark.sparkContext.clearJobGroup() @@ -28,7 +28,9 @@ object CacheInDataFrames extends App { spark.sparkContext.setJobGroup("GroupID_2", "Selecting Data") val selectedDataFrame = - 
productPurchasedDF.select("userId", "products").cache() + productPurchasedDF + .select("userId", "products") + .persist(StorageLevel.MEMORY_AND_DISK) selectedDataFrame.show() @@ -43,6 +45,7 @@ object CacheInDataFrames extends App { Thread.sleep(1000000) spark.sparkContext.clearJobGroup() + selectedDataFrame.unpersist() } diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/data/purchasecompleted.json b/src/main/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/data/purchasecompleted.json similarity index 100% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/data/purchasecompleted.json rename to src/main/com/codely/lesson_03__cache_and_joins/video_01__cache_in_dataframes/data/purchasecompleted.json diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/JoinDataFrames.scala b/src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/JoinDataFrames.scala similarity index 89% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/JoinDataFrames.scala rename to src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/JoinDataFrames.scala index aa6d9b2..11ce6b7 100644 --- a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/JoinDataFrames.scala +++ b/src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/JoinDataFrames.scala @@ -15,15 +15,15 @@ private object JoinDataFrames extends App { private def readJson(path: String): DataFrame = spark.read.json(path) val productViewedDF = readJson( - "src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/data/productViewed.json" + "src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/data/productViewed.json" ) val productAddedDF = readJson( - "src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/data/addedToCart.json" + "src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/data/addedToCart.json" ) val productPurchasedDF = readJson( - "src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/data/purchasecompleted.json" + "src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/data/purchasecompleted.json" ) import spark.implicits._ @@ -96,7 +96,7 @@ private object JoinDataFrames extends App { viewedNotAddedDF.show() - val allEventsDF = productViewedDFlatDF + val userJourneyDataframe = productViewedDFlatDF .join( productAddedFlatDF, productViewedDFlatDF("userId") === productAddedFlatDF( @@ -117,6 +117,6 @@ private object JoinDataFrames extends App { productAddedFlatDF("timestamp").as("addedTime") ) - allEventsDF.show(100) + userJourneyDataframe.show(100) } diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/data/addedToCart.json b/src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/data/addedToCart.json similarity index 100% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/data/addedToCart.json rename to src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/data/addedToCart.json diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/data/productViewed.json b/src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/data/productViewed.json similarity index 100% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/data/productViewed.json rename to src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/data/productViewed.json diff --git 
a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/data/purchasecompleted.json b/src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/data/purchasecompleted.json similarity index 100% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/video_02__joins/data/purchasecompleted.json rename to src/main/com/codely/lesson_03__cache_and_joins/video_02__joins/data/purchasecompleted.json diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_03__optimizations/Optimizations.scala b/src/main/com/codely/lesson_03__cache_and_joins/video_03__catalyst_optimizations/Optimizations.scala similarity index 87% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/video_03__optimizations/Optimizations.scala rename to src/main/com/codely/lesson_03__cache_and_joins/video_03__catalyst_optimizations/Optimizations.scala index 9977981..f228dd0 100644 --- a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_03__optimizations/Optimizations.scala +++ b/src/main/com/codely/lesson_03__cache_and_joins/video_03__catalyst_optimizations/Optimizations.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_03__cache_and_joins.video_03__optimizations +package com.codely.lesson_03__cache_and_joins.video_03__catalyst_optimizations import org.apache.spark.rdd.RDD import org.apache.spark.sql.functions.col @@ -38,7 +38,7 @@ object Optimizations extends App { val productViewedDF = spark.read .json( - "src/main/scala/com/codely/lesson_03__cache_and_joins/video_03__optimizations/data/productViewed.json" + "src/main/com/codely/lesson_03__cache_and_joins/video_03__catalyst_optimizations/data/productViewed.json" ) case class ProductViewed( diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/video_03__optimizations/data/productViewed.json b/src/main/com/codely/lesson_03__cache_and_joins/video_03__catalyst_optimizations/data/productViewed.json similarity index 100% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/video_03__optimizations/data/productViewed.json rename to src/main/com/codely/lesson_03__cache_and_joins/video_03__catalyst_optimizations/data/productViewed.json diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/z_practical_exercise/JoinsExercise.scala b/src/main/com/codely/lesson_03__cache_and_joins/z_practical_exercise/JoinsExercise.scala similarity index 87% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/z_practical_exercise/JoinsExercise.scala rename to src/main/com/codely/lesson_03__cache_and_joins/z_practical_exercise/JoinsExercise.scala index 86b9494..809015c 100644 --- a/src/main/scala/com/codely/lesson_03__cache_and_joins/z_practical_exercise/JoinsExercise.scala +++ b/src/main/com/codely/lesson_03__cache_and_joins/z_practical_exercise/JoinsExercise.scala @@ -16,9 +16,9 @@ object JoinsExercise extends App { // 2. 
val usersFilePath = - "src/main/scala/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/users.csv" + "src/main/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/users.csv" val transactionsFilePath = - "src/main/scala/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/transactions.csv" + "src/main/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/transactions.csv" val usersDF = spark.read .option("header", "true") diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/transactions.csv b/src/main/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/transactions.csv similarity index 100% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/transactions.csv rename to src/main/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/transactions.csv diff --git a/src/main/scala/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/users.csv b/src/main/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/users.csv similarity index 100% rename from src/main/scala/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/users.csv rename to src/main/com/codely/lesson_03__cache_and_joins/z_practical_exercise/data/users.csv diff --git a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_01__introduction_to_persistence/WritingDataFrames.scala b/src/main/com/codely/lesson_04__persistence/video_01__introduction_to_persistence/WritingDataFrames.scala similarity index 68% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_01__introduction_to_persistence/WritingDataFrames.scala rename to src/main/com/codely/lesson_04__persistence/video_01__introduction_to_persistence/WritingDataFrames.scala index cce12d5..7014ccd 100644 --- a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_01__introduction_to_persistence/WritingDataFrames.scala +++ b/src/main/com/codely/lesson_04__persistence/video_01__introduction_to_persistence/WritingDataFrames.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_04__build_your_lakehouse.video_01__introduction_to_persistence +package com.codely.lesson_04__persistence.video_01__introduction_to_persistence import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.sql.functions.{avg, explode} @@ -6,20 +6,17 @@ import org.apache.spark.sql.functions.{avg, explode} object WritingDataFrames extends App { val spark: SparkSession = SparkSession .builder() - .master("local") + .master("local[*]") .getOrCreate() spark.sparkContext.setLogLevel("WARN") + import spark.implicits._ val productPurchasedFilePath = - "src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_01__introduction_to_persistence/data/purchasecompleted.json" + "src/main/com/codely/lesson_04__persistence/video_01__introduction_to_persistence/data/purchasecompleted.json" val productPurchasedDF = spark.read.json(productPurchasedFilePath) - spark.sparkContext.setLogLevel("WARN") - - import spark.implicits._ - val avgSpendingPerUserDF = productPurchasedDF .select($"userId", explode($"products").as("product")) .select( @@ -31,15 +28,15 @@ object WritingDataFrames extends App { .orderBy($"AvgSpending".desc) val outputBasePath = - "src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_01__introduction_to_persistence/output/" + "src/main/com/codely/lesson_04__persistence/video_01__introduction_to_persistence/output/" - avgSpendingPerUserDF.write - .save(outputBasePath) + /* 
avgSpendingPerUserDF.write + .save(outputBasePath)*/ - avgSpendingPerUserDF.write + /* avgSpendingPerUserDF.write .format("CSV") .option("header", "true") - .save(s"$outputBasePath/csv/") + .save(s"$outputBasePath/csv/")*/ val newData = Seq( (1, 99), @@ -49,7 +46,7 @@ object WritingDataFrames extends App { /* newData.write .format("csv") .option("header", "true") - .save(s"$outputBasePath/csv/")*/ // This will fail + .save(s"$outputBasePath/csv/")*/ newData.write .format("csv") diff --git a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_01__introduction_to_persistence/data/purchasecompleted.json b/src/main/com/codely/lesson_04__persistence/video_01__introduction_to_persistence/data/purchasecompleted.json similarity index 100% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_01__introduction_to_persistence/data/purchasecompleted.json rename to src/main/com/codely/lesson_04__persistence/video_01__introduction_to_persistence/data/purchasecompleted.json diff --git a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_02__partitioning_data/PartitioningData.scala b/src/main/com/codely/lesson_04__persistence/video_02__partitioning_data/PartitioningData.scala similarity index 84% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_02__partitioning_data/PartitioningData.scala rename to src/main/com/codely/lesson_04__persistence/video_02__partitioning_data/PartitioningData.scala index 33fbcbc..a26b9e6 100644 --- a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_02__partitioning_data/PartitioningData.scala +++ b/src/main/com/codely/lesson_04__persistence/video_02__partitioning_data/PartitioningData.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_04__build_your_lakehouse.video_02__partitioning_data +package com.codely.lesson_04__persistence.video_02__partitioning_data import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.{avg, explode, month, to_date} @@ -10,14 +10,13 @@ object PartitioningData extends App { .getOrCreate() spark.sparkContext.setLogLevel("WARN") + import spark.implicits._ val productPurchasedFilePath = - "src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_02__partitioning_data/data/purchasecompleted.json" + "src/main/com/codely/lesson_04__persistence/video_02__partitioning_data/data/purchasecompleted.json" val productPurchasedDF = spark.read.json(productPurchasedFilePath) - import spark.implicits._ - val avgSpendingPerUserDF = productPurchasedDF .withColumn("date", to_date($"timestamp", "yyyy-MM-dd'T'HH:mm:ss'Z'")) .select($"userId", explode($"products").as("product"), $"date") @@ -32,7 +31,7 @@ object PartitioningData extends App { .orderBy($"userId", $"category", $"month") val outputBasePath = - "src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_02__partitioning_data/data/output/" + "src/main/com/codely/lesson_04__persistence/video_02__partitioning_data/output/" avgSpendingPerUserDF.write .mode("overwrite") @@ -49,14 +48,18 @@ object PartitioningData extends App { .load(s"$outputBasePath/noPartitioned/") .filter($"category" === "Electronics" && $"month" === 1) .show() + spark.sparkContext.clearJobGroup() spark.sparkContext.setJobGroup("GroupID_1", "Read Data partitioned") + spark.read .load(s"$outputBasePath/partitioned/") .filter($"category" === "Electronics" && $"month" === 1) .show() + spark.sparkContext.clearJobGroup() + Thread.sleep(1000000) } diff --git 
a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_02__partitioning_data/data/purchasecompleted.json b/src/main/com/codely/lesson_04__persistence/video_02__partitioning_data/data/purchasecompleted.json similarity index 100% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_02__partitioning_data/data/purchasecompleted.json rename to src/main/com/codely/lesson_04__persistence/video_02__partitioning_data/data/purchasecompleted.json diff --git a/src/main/com/codely/lesson_04__persistence/z_practical_exercise/WebAccessAnalysis.scala b/src/main/com/codely/lesson_04__persistence/z_practical_exercise/WebAccessAnalysis.scala new file mode 100644 index 0000000..c030b0b --- /dev/null +++ b/src/main/com/codely/lesson_04__persistence/z_practical_exercise/WebAccessAnalysis.scala @@ -0,0 +1,67 @@ +package com.codely.lesson_04__persistence.z_practical_exercise + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.SaveMode + +object WebAccessAnalysis extends App { + + val spark = SparkSession.builder + .appName("Web Access Analysis") + .config("spark.master", "local") + .getOrCreate() + + // 1. + val accessLogDF = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv( + "src/main/com/codely/lesson_04__persistence/z_practical_exercise/data/access_logs.csv" + ) + + accessLogDF.printSchema() + accessLogDF.show() + + val accessLogWithDateDF = + accessLogDF.withColumn("date", to_date(col("timestamp"))) + + // 2. + val outputBasePath = + "src/main/com/codely/lesson_04__persistence/z_practical_exercise/output/" + + accessLogWithDateDF.write + .mode(SaveMode.Overwrite) + .parquet(s"$outputBasePath/access_logs_parquet_overwrite") + + // 4. + accessLogWithDateDF.write + .mode(SaveMode.Overwrite) + .saveAsTable("web_access_logs") + + // 5. + accessLogWithDateDF.write + .partitionBy("date", "user_id") + .mode(SaveMode.Overwrite) + .parquet(s"$outputBasePath/access_logs_partitioned") + + // 6. 
+ val nonPartitionedDF = + spark.read.parquet(s"$outputBasePath/access_logs_parquet") + val partitionedDF = + spark.read.parquet(s"$outputBasePath/access_logs_partitioned") + + nonPartitionedDF.createOrReplaceTempView("non_partitioned") + partitionedDF.createOrReplaceTempView("partitioned") + + spark + .sql( + "SELECT user_id, COUNT(page_url) as pages_visited FROM non_partitioned GROUP BY user_id" + ) + .show() + spark + .sql( + "SELECT user_id, COUNT(page_url) as pages_visited FROM partitioned GROUP BY user_id" + ) + .show() + +} diff --git a/src/main/com/codely/lesson_04__persistence/z_practical_exercise/data/access_logs.csv b/src/main/com/codely/lesson_04__persistence/z_practical_exercise/data/access_logs.csv new file mode 100644 index 0000000..6a4d9c8 --- /dev/null +++ b/src/main/com/codely/lesson_04__persistence/z_practical_exercise/data/access_logs.csv @@ -0,0 +1,6 @@ +user_id,session_id,page_url,timestamp +1,2001, 10:00:00 +2,2002, 10:05:00 +1,2003, 11:00:00 +3,2004, 11:05:00 +2,2005, 12:00:00 diff --git a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_03__persistent_tables/WritingPersistentTables.scala b/src/main/com/codely/lesson_05__build_your_lakehouse/video_03__persistent_tables/WritingPersistentTables.scala similarity index 91% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_03__persistent_tables/WritingPersistentTables.scala rename to src/main/com/codely/lesson_05__build_your_lakehouse/video_03__persistent_tables/WritingPersistentTables.scala index 7fbc3b9..17fa355 100644 --- a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_03__persistent_tables/WritingPersistentTables.scala +++ b/src/main/com/codely/lesson_05__build_your_lakehouse/video_03__persistent_tables/WritingPersistentTables.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_04__build_your_lakehouse.video_03__persistent_tables +package com.codely.lesson_05__build_your_lakehouse.video_03__persistent_tables import org.apache.spark.sql.{SaveMode, SparkSession} import org.apache.spark.sql.functions.{avg, explode, month, to_date} @@ -7,6 +7,7 @@ object WritingPersistentTables extends App { val spark: SparkSession = SparkSession .builder() .config("spark.hadoop.hive.metastore.uris", "thrift://localhost:9083") + .config("spark.sql.hive.metastore.jars", "builtin") .config( "spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain" @@ -15,7 +16,6 @@ object WritingPersistentTables extends App { .config("spark.hadoop.fs.s3a.secret.key", "test") .config("spark.hadoop.fs.s3a.endpoint", "http://localhost:4566") .config("spark.hadoop.fs.s3a.path.style.access", "true") - .config("spark.sql.hive.metastore.jars", "builtin") .config( "spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem" @@ -28,7 +28,7 @@ object WritingPersistentTables extends App { val productPurchasedDF = spark.read.json( - "src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_03__persistent_tables/data/purchasecompleted.json" + "src/main/com/codely/lesson_05__build_your_lakehouse/video_03__persistent_tables/data/purchasecompleted.json" ) import spark.implicits._ diff --git a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_03__persistent_tables/data/purchasecompleted.json b/src/main/com/codely/lesson_05__build_your_lakehouse/video_03__persistent_tables/data/purchasecompleted.json similarity index 100% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_03__persistent_tables/data/purchasecompleted.json 
rename to src/main/com/codely/lesson_05__build_your_lakehouse/video_03__persistent_tables/data/purchasecompleted.json diff --git a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_04__delta_lake/WritingDeltaTables.scala b/src/main/com/codely/lesson_05__build_your_lakehouse/video_04__delta_lake/WritingDeltaTables.scala similarity index 94% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_04__delta_lake/WritingDeltaTables.scala rename to src/main/com/codely/lesson_05__build_your_lakehouse/video_04__delta_lake/WritingDeltaTables.scala index f165c74..e0bc614 100644 --- a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_04__delta_lake/WritingDeltaTables.scala +++ b/src/main/com/codely/lesson_05__build_your_lakehouse/video_04__delta_lake/WritingDeltaTables.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_04__build_your_lakehouse.video_04__delta_lake +package com.codely.lesson_05__build_your_lakehouse.video_04__delta_lake import io.delta.tables.DeltaTable import org.apache.spark.sql.{SaveMode, SparkSession} @@ -18,7 +18,7 @@ object WritingDeltaTables extends App { spark.sparkContext.setLogLevel("WARN") val productPurchasedDF = spark.read.json( - "src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_04__delta_lake/data/purchasecompleted.json" + "src/main/com/codely/lesson_05__build_your_lakehouse/video_04__delta_lake/data/purchasecompleted.json" ) import spark.implicits._ diff --git a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_04__delta_lake/data/purchasecompleted.json b/src/main/com/codely/lesson_05__build_your_lakehouse/video_04__delta_lake/data/purchasecompleted.json similarity index 100% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/video_04__delta_lake/data/purchasecompleted.json rename to src/main/com/codely/lesson_05__build_your_lakehouse/video_04__delta_lake/data/purchasecompleted.json diff --git a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/z_practical_exercise/DeltaLakeOperations.scala b/src/main/com/codely/lesson_05__build_your_lakehouse/z_practical_exercise/DeltaLakeOperations.scala similarity index 75% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/z_practical_exercise/DeltaLakeOperations.scala rename to src/main/com/codely/lesson_05__build_your_lakehouse/z_practical_exercise/DeltaLakeOperations.scala index 6d4c067..60f7dc3 100644 --- a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/z_practical_exercise/DeltaLakeOperations.scala +++ b/src/main/com/codely/lesson_05__build_your_lakehouse/z_practical_exercise/DeltaLakeOperations.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_04__build_your_lakehouse.z_practical_exercise +package com.codely.lesson_05__build_your_lakehouse.z_practical_exercise import org.apache.spark.sql.SparkSession import io.delta.tables._ @@ -21,24 +21,23 @@ object DeltaLakeOperations extends App { // 2. val transactionsFilePath = - "src/main/scala/com/codely/lesson_04__build_your_lakehouse/z_practical_exercise/data/transactions.csv" - - val usersDF = spark.read - .option("header", "true") - .csv(transactionsFilePath) + "src/main/com/codely/lesson_05__build_your_lakehouse/z_practical_exercise/data/transactions.csv" val transactionsDF = spark.read .option("header", "true") .csv(transactionsFilePath) + val outputBasePath = + "src/main/com/codely/lesson_05__build_your_lakehouse/z_practical_exercise/output/" + // 3. 
- usersDF.write.format("delta").mode("overwrite").save("output/delta/users") transactionsDF.write .format("delta") .mode("overwrite") - .save("/tmp/delta/transactions") + .save(s"$outputBasePath/delta/transactions") - val transactionsDeltaTable = DeltaTable.forPath("output/delta/users") + val transactionsDeltaTable = + DeltaTable.forPath(s"$outputBasePath/delta/transactions") // 4. transactionsDeltaTable.update( diff --git a/src/main/scala/com/codely/lesson_04__build_your_lakehouse/z_practical_exercise/data/transactions.csv b/src/main/com/codely/lesson_05__build_your_lakehouse/z_practical_exercise/data/transactions.csv similarity index 100% rename from src/main/scala/com/codely/lesson_04__build_your_lakehouse/z_practical_exercise/data/transactions.csv rename to src/main/com/codely/lesson_05__build_your_lakehouse/z_practical_exercise/data/transactions.csv diff --git a/src/main/scala/com/codely/lesson_05__spark_streaming/video_01__intro_spark_streaming/IntroSparkStreaming.scala b/src/main/com/codely/lesson_06__spark_streaming/video_01__intro_spark_streaming/IntroSparkStreaming.scala similarity index 77% rename from src/main/scala/com/codely/lesson_05__spark_streaming/video_01__intro_spark_streaming/IntroSparkStreaming.scala rename to src/main/com/codely/lesson_06__spark_streaming/video_01__intro_spark_streaming/IntroSparkStreaming.scala index 7ce5c91..4c7191a 100644 --- a/src/main/scala/com/codely/lesson_05__spark_streaming/video_01__intro_spark_streaming/IntroSparkStreaming.scala +++ b/src/main/com/codely/lesson_06__spark_streaming/video_01__intro_spark_streaming/IntroSparkStreaming.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_05__spark_streaming.video_01__intro_spark_streaming +package com.codely.lesson_06__spark_streaming.video_01__intro_spark_streaming import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions.{avg, window} @@ -8,27 +8,24 @@ private object IntroSparkStreaming extends App { val spark: SparkSession = SparkSession .builder() - .master("local") + .master("local[8]") .getOrCreate() spark.sparkContext.setLogLevel("WARN") + import spark.implicits._ val source = spark.readStream .format("rate") .option("rowsPerSecond", 2) .load() - import spark.implicits._ - val aggStream = source .groupBy(window($"timestamp", "10 seconds")) .agg(avg("value")) val sink = aggStream.writeStream .format("console") - //.outputMode(OutputMode.Complete()) - //.outputMode(OutputMode.Update()) - .outputMode(OutputMode.Append()) + .outputMode(OutputMode.Update()) .option("numRows", "10") .option("truncate", "false") diff --git a/src/main/scala/com/codely/lesson_05__spark_streaming/video_03__optimizations_in_streaming/SparkStreamingAggregationsWatermark.scala b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/SparkStreamingAggregationsWatermark.scala similarity index 57% rename from src/main/scala/com/codely/lesson_05__spark_streaming/video_03__optimizations_in_streaming/SparkStreamingAggregationsWatermark.scala rename to src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/SparkStreamingAggregationsWatermark.scala index 2f41d5c..f5a1275 100644 --- a/src/main/scala/com/codely/lesson_05__spark_streaming/video_03__optimizations_in_streaming/SparkStreamingAggregationsWatermark.scala +++ b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/SparkStreamingAggregationsWatermark.scala @@ -1,7 +1,7 @@ -package com.codely.lesson_05__spark_streaming.video_03__optimizations_in_streaming +package 
com.codely.lesson_06__spark_streaming.video_02__late_events -import com.codely.lesson_05__spark_streaming.video_02__spark_streaming_agg.commons.Schemas.purchasedSchema -import org.apache.spark.sql.functions.{avg, explode, month, to_timestamp} +import com.codely.lesson_06__spark_streaming.video_02__late_events.commons.Schemas.purchasedSchema +import org.apache.spark.sql.functions.{avg, explode, to_timestamp, window} import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.{DataFrame, SparkSession} @@ -9,6 +9,7 @@ private object SparkStreamingAggregationsWatermark extends App { val spark: SparkSession = SparkSession .builder() + .appName("Late events") .master("local[*]") .getOrCreate() @@ -18,7 +19,7 @@ private object SparkStreamingAggregationsWatermark extends App { .format("json") .schema(purchasedSchema) .load( - "src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/streaming_agg" + "src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg" ) import spark.implicits._ @@ -28,21 +29,20 @@ private object SparkStreamingAggregationsWatermark extends App { "timestamp", to_timestamp($"timestamp", "yyyy-MM-dd'T'HH:mm:ss'Z'") ) - .withWatermark("timestamp", "15 seconds") - .select($"userId", explode($"products").as("product"), $"timestamp") + .withWatermark("timestamp", "1 hours") + .select(explode($"products").as("product"), $"timestamp") .select( - $"userId", - $"product.category", $"timestamp", - month($"timestamp").alias("month"), ($"product.price" * $"product.quantity").alias("totalSpent") ) - .groupBy($"userId", $"category", $"month", $"timestamp") + .groupBy(window($"timestamp", "24 hours")) .agg(avg("totalSpent").alias("AvgSpending")) avgSpendingPerUserDF.writeStream .format("console") .outputMode(OutputMode.Append()) + .option("numRows", 100) + .option("truncate", "false") .start() .awaitTermination() } diff --git a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/commons/Schemas.scala b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/commons/Schemas.scala similarity index 90% rename from src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/commons/Schemas.scala rename to src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/commons/Schemas.scala index 758bd11..88de6da 100644 --- a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/commons/Schemas.scala +++ b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/commons/Schemas.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_05__spark_streaming.video_02__spark_streaming_agg.commons +package com.codely.lesson_06__spark_streaming.video_02__late_events.commons object Schemas { diff --git a/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted.json b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted.json new file mode 100644 index 0000000..3cf108e --- /dev/null +++ b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted.json @@ -0,0 +1,3 @@ +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T01:00:00Z", "userId": "user182", "transactionId": "trans1065", "products": [{"productId": "prod599", "quantity": 2, "description": "Augmented Reality Glasses", "category": "Wearables", "price": 549.99}, {"productId": "prod521", "quantity": 3, "description": "Smart Home Hub", 
"category": "Smart Home", "price": 129.99}, {"productId": "prod590", "quantity": 2, "description": "Streaming Microphone", "category": "Streaming", "price": 199.99}], "eventId": "b0d0cdaa-2547-4250-a64d-87f52c59195f"} +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T08:30:00Z", "userId": "user183", "transactionId": "trans1065", "products": [{"productId": "prod599", "quantity": 1, "description": "Augmented Reality Glasses", "category": "Wearables", "price": 549.99}, {"productId": "prod521", "quantity": 3, "description": "Smart Home Hub", "category": "Smart Home", "price": 129.99}, {"productId": "prod590", "quantity": 2, "description": "Streaming Microphone", "category": "Streaming", "price": 199.99}], "eventId": "b0d0cdaa-2547-4250-a64d-87f52c59157f"} +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T23:00:00Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 39.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", "price": 159.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1fad"} diff --git a/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted_2.json b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted_2.json new file mode 100644 index 0000000..9eea081 --- /dev/null +++ b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted_2.json @@ -0,0 +1 @@ +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T23:30:00Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 39.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", "price": 159.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1fad"} diff --git a/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted_3.json b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted_3.json new file mode 100644 index 0000000..2a6ed37 --- /dev/null +++ b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted_3.json @@ -0,0 +1 @@ +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-09T23:30:00Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 39.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", "price": 159.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1fad"} diff --git a/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted_late_event.json b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted_late_event.json new file mode 100644 index 0000000..9eea081 --- /dev/null +++ 
b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/input/purchasecompleted_late_event.json @@ -0,0 +1 @@ +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T23:30:00Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 39.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", "price": 159.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1fad"} diff --git a/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted.json b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted.json new file mode 100644 index 0000000..3cf108e --- /dev/null +++ b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted.json @@ -0,0 +1,3 @@ +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T01:00:00Z", "userId": "user182", "transactionId": "trans1065", "products": [{"productId": "prod599", "quantity": 2, "description": "Augmented Reality Glasses", "category": "Wearables", "price": 549.99}, {"productId": "prod521", "quantity": 3, "description": "Smart Home Hub", "category": "Smart Home", "price": 129.99}, {"productId": "prod590", "quantity": 2, "description": "Streaming Microphone", "category": "Streaming", "price": 199.99}], "eventId": "b0d0cdaa-2547-4250-a64d-87f52c59195f"} +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T08:30:00Z", "userId": "user183", "transactionId": "trans1065", "products": [{"productId": "prod599", "quantity": 1, "description": "Augmented Reality Glasses", "category": "Wearables", "price": 549.99}, {"productId": "prod521", "quantity": 3, "description": "Smart Home Hub", "category": "Smart Home", "price": 129.99}, {"productId": "prod590", "quantity": 2, "description": "Streaming Microphone", "category": "Streaming", "price": 199.99}], "eventId": "b0d0cdaa-2547-4250-a64d-87f52c59157f"} +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T23:00:00Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 39.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", "price": 159.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1fad"} diff --git a/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted_2.json b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted_2.json new file mode 100644 index 0000000..9eea081 --- /dev/null +++ b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted_2.json @@ -0,0 +1 @@ +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T23:30:00Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 39.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", 
"price": 159.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1fad"} diff --git a/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted_3.json b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted_3.json new file mode 100644 index 0000000..2a6ed37 --- /dev/null +++ b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted_3.json @@ -0,0 +1 @@ +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-09T23:30:00Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 39.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", "price": 159.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1fad"} diff --git a/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted_late_event.json b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted_late_event.json new file mode 100644 index 0000000..9eea081 --- /dev/null +++ b/src/main/com/codely/lesson_06__spark_streaming/video_02__late_events/data/streaming_agg/purchasecompleted_late_event.json @@ -0,0 +1 @@ +{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T23:30:00Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 39.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", "price": 159.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1fad"} diff --git a/src/main/scala/com/codely/lesson_05__spark_streaming/z_practical_exercise/StreamingAnalysis.scala b/src/main/com/codely/lesson_06__spark_streaming/z_practical_exercise/StreamingAnalysis.scala similarity index 96% rename from src/main/scala/com/codely/lesson_05__spark_streaming/z_practical_exercise/StreamingAnalysis.scala rename to src/main/com/codely/lesson_06__spark_streaming/z_practical_exercise/StreamingAnalysis.scala index 72f69f8..58574d3 100644 --- a/src/main/scala/com/codely/lesson_05__spark_streaming/z_practical_exercise/StreamingAnalysis.scala +++ b/src/main/com/codely/lesson_06__spark_streaming/z_practical_exercise/StreamingAnalysis.scala @@ -1,4 +1,4 @@ -package com.codely.lesson_05__spark_streaming.z_practical_exercise +package com.codely.lesson_06__spark_streaming.z_practical_exercise import org.apache.spark.sql.SparkSession import org.apache.spark.sql.functions._ diff --git a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/SparkStreamingAggregations.scala b/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/SparkStreamingAggregations.scala deleted file mode 100644 index 24bd7e3..0000000 --- a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/SparkStreamingAggregations.scala +++ /dev/null @@ -1,45 +0,0 @@ -package 
com.codely.lesson_05__spark_streaming.video_02__spark_streaming_agg - -import com.codely.lesson_05__spark_streaming.video_02__spark_streaming_agg.commons.Schemas.purchasedSchema -import org.apache.spark.sql.functions.{avg, explode, month, to_date} -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.streaming.OutputMode - -private object SparkStreamingAggregations extends App { - - val spark: SparkSession = SparkSession - .builder() - .master("local") - .getOrCreate() - - spark.sparkContext.setLogLevel("WARN") - - val source: DataFrame = spark.readStream - .format("json") - .schema(purchasedSchema) - .load( - "src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/streaming_agg" - ) - - import spark.implicits._ - - val avgSpendingPerUserDF = source - .withColumn("date", to_date($"timestamp", "yyyy-MM-dd'T'HH:mm:ss'Z'")) - .select($"userId", explode($"products").as("product"), $"date") - .select( - $"userId", - $"product.category", - month($"date").alias("month"), - ($"product.price" * $"product.quantity").alias("totalSpent") - ) - .groupBy($"userId", $"category", $"month") - .agg(avg("totalSpent").alias("AvgSpending")) - .orderBy($"userId", $"category", $"month") - - avgSpendingPerUserDF.writeStream - .format("console") - .outputMode(OutputMode.Complete()) - //.outputMode(OutputMode.Update()) - .start() - .awaitTermination() -} diff --git a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/SparkStreamingJoins.scala b/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/SparkStreamingJoins.scala deleted file mode 100644 index 82c4e52..0000000 --- a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/SparkStreamingJoins.scala +++ /dev/null @@ -1,53 +0,0 @@ -package com.codely.lesson_05__spark_streaming.video_02__spark_streaming_agg - -import com.codely.lesson_05__spark_streaming.video_02__spark_streaming_agg.commons.Schemas._ -import org.apache.spark.sql.functions.col -import org.apache.spark.sql.{DataFrame, SparkSession, functions} - -private object SparkStreamingJoins extends App { - - val spark: SparkSession = SparkSession - .builder() - .master("local[*]") - .getOrCreate() - - spark.sparkContext.setLogLevel("WARN") - - val productPurchasedDF: DataFrame = spark.readStream - .format("json") - .schema(purchasedSchema) - .load( - "src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/streaming_join/" - ) - - val productsDF = spark.read - .format("json") - .load( - "src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/products/products.json" - ) - - val productPurchasedFlatDF = productPurchasedDF - .filter(functions.size(col("products")) === 1) - .selectExpr( - "userId", - "timestamp", - "inline(products)" - ) - .withColumnRenamed("productId", "purchasedProductId") - .select("userId", "purchasedProductId", "timestamp") - - val joinCondition = - productPurchasedFlatDF("purchasedProductId") === productsDF("productId") - - productPurchasedFlatDF - .join( - productsDF, - joinCondition - ) - .writeStream - .format("console") - .option("truncate", "false") - .outputMode("append") - .start() - .awaitTermination() -} diff --git a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/products/products.json b/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/products/products.json deleted file mode 100644 
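// Editorial aside, not part of the patch: the SparkStreamingJoins example removed above
// demonstrates a stream-static join. A minimal, self-contained sketch of the same idea,
// using the built-in rate source and an in-memory static table (all names and values here
// are illustrative assumptions, not taken from this repository):
import org.apache.spark.sql.SparkSession

object StreamStaticJoinSketch extends App {

  val spark = SparkSession
    .builder()
    .appName("StreamStaticJoinSketch")
    .master("local[*]")
    .getOrCreate()

  spark.sparkContext.setLogLevel("WARN")
  import spark.implicits._

  // Static side: a small in-memory product catalogue.
  val productsDF =
    Seq((0L, "Wireless Mouse"), (1L, "USB Keyboard")).toDF("productId", "description")

  // Streaming side: the rate source stands in for the JSON purchase stream used in the course.
  val purchasesDF = spark.readStream
    .format("rate")
    .option("rowsPerSecond", 1)
    .load()
    .selectExpr("value % 2 AS productId", "timestamp")

  // Stream-static joins are stateless: every micro-batch is joined against the static side.
  purchasesDF
    .join(productsDF, "productId")
    .writeStream
    .format("console")
    .outputMode("append")
    .start()
    .awaitTermination()
}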
index 06a66f6..0000000 --- a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/products/products.json +++ /dev/null @@ -1,46 +0,0 @@ -{"productId": "prod553", "description": "Wireless Mouse - Magnetic"} -{"productId": "prod516", "description": "USB Keyboard - Multi-pack"} -{"productId": "prod520", "description": "Ergonomic Chair - Touchscreen"} -{"productId": "prod525", "description": "Smartphone Case - Improved Specs"} -{"productId": "prod551", "description": "Bluetooth Speaker - High Speed"} -{"productId": "prod537", "description": "Laptop Stand - Wireless"} -{"productId": "prod540", "description": "External Hard Drive - Energy Efficient"} -{"productId": "prod539", "description": "Wi-Fi Router - UV Protection"} -{"productId": "prod504", "description": "HD Webcam - Lightweight"} -{"productId": "prod546", "description": "Noise Cancelling Headphones - New Release"} -{"productId": "prod505", "description": "Smartwatch - Secure"} -{"productId": "prod524", "description": "Tablet Case - Water Resistant"} -{"productId": "prod518", "description": "USB-C Hub - 4K Support"} -{"productId": "prod531", "description": "E-book Reader - Ergonomic"} -{"productId": "prod558", "description": "Graphic Tablet - Gesture Control"} -{"productId": "prod554", "description": "Digital Camera - Fast Charging"} -{"productId": "prod597", "description": "Fitness Tracker - Low Power"} -{"productId": "prod511", "description": "Portable Charger - Space Saving"} -{"productId": "prod567", "description": "Smart Light Bulb - Smart Technology"} -{"productId": "prod528", "description": "Streaming Stick - VR Ready"} -{"productId": "prod572", "description": "VR Headset - Budget Friendly"} -{"productId": "prod559", "description": "Mechanical Keyboard - Voice Activated"} -{"productId": "prod578", "description": "Smart Thermostat - Heat Resistant"} -{"productId": "prod552", "description": "Computer Monitor - AI Integrated"} -{"productId": "prod555", "description": "Gaming Chair - High Resolution"} -{"productId": "prod526", "description": "Wireless Earbuds - Dual Band"} -{"productId": "prod561", "description": "Smart Door Lock - App Controlled"} -{"productId": "prod545", "description": "Action Camera - Child Safe"} -{"productId": "prod577", "description": "Compact Drone - Pet Friendly"} -{"productId": "prod574", "description": "Wireless Router - Long Lasting"} -{"productId": "prod571", "description": "Smart Light Switch - Solar Powered"} -{"productId": "prod515", "description": "Digital Notepad - Extended Warranty"} -{"productId": "prod517", "description": "Electric Kettle - High Capacity"} -{"productId": "prod573", "description": "Smart Vacuum Cleaner - Collector's Edition"} -{"productId": "prod583", "description": "Noise Cancelling Earbuds - Deluxe Package"} -{"productId": "prod514", "description": "LED Desk Lamp - High Contrast"} -{"productId": "prod564", "description": "Smartphone Stand - Energy Efficient"} -{"productId": "prod569", "description": "Smart Thermostat - Special Offer"} -{"productId": "prod507", "description": "Wireless Earphones - Limited Edition"} -{"productId": "prod543", "description": "Gaming Mouse - Portable"} -{"productId": "prod585", "description": "Gaming Keyboard - Premium Model"} -{"productId": "prod586", "description": "Solar Power Bank - Eco-friendly"} -{"productId": "prod587", "description": "Robot Building Kit - Exclusive Series"} -{"productId": "prod599", "description": "Augmented Reality Glasses - Advanced Tech"} -{"productId": "prod521", "description": "Smart 
Home Hub - Enhanced Features"} -{"productId": "prod590", "description": "Streaming Microphone - Multi-color"} \ No newline at end of file diff --git a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/purchasecompleted.json b/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/purchasecompleted.json deleted file mode 100644 index 3a21a6d..0000000 --- a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/purchasecompleted.json +++ /dev/null @@ -1,21 +0,0 @@ -{"eventType": "PurchaseCompleted", "timestamp": "2024-01-11T10:41:28Z", "userId": "user165", "transactionId": "trans1817", "products": [{"productId": "prod553", "quantity": 3, "description": "Wireless Mouse", "category": "Electronics", "price": 29.99}, {"productId": "prod516", "quantity": 1, "description": "USB Keyboard", "category": "Electronics", "price": 49.99}], "eventId": "afda4d10-aacb-4451-823e-d5a1ec4eb92d"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-01-05T13:40:38Z", "userId": "user110", "transactionId": "trans1126", "products": [{"productId": "prod544", "quantity": 1, "description": "Ergonomic Chair", "category": "Office", "price": 199.99}, {"productId": "prod525", "quantity": 2, "description": "Smartphone Case", "category": "Accessories", "price": 15.99}, {"productId": "prod551", "quantity": 1, "description": "Bluetooth Speaker", "category": "Audio", "price": 89.99}, {"productId": "prod537", "quantity": 2, "description": "Laptop Stand", "category": "Office", "price": 34.99}, {"productId": "prod520", "quantity": 1, "description": "Gaming Mousepad", "category": "Gaming", "price": 29.99}], "eventId": "081c8c3e-9db0-4bff-8022-64248c8406d7"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-01-13T06:16:18Z", "userId": "user141", "transactionId": "trans1944", "products": [{"productId": "prod540", "quantity": 3, "description": "External Hard Drive", "category": "Storage", "price": 129.99}, {"productId": "prod539", "quantity": 3, "description": "Wi-Fi Router", "category": "Networking", "price": 79.99}, {"productId": "prod504", "quantity": 3, "description": "HD Webcam", "category": "Peripherals", "price": 99.99}, {"productId": "prod546", "quantity": 2, "description": "Noise Cancelling Headphones", "category": "Audio", "price": 249.99}, {"productId": "prod505", "quantity": 1, "description": "Smartwatch", "category": "Wearables", "price": 299.99}], "eventId": "d6122f53-fc62-4c06-b272-26b16d6446c1"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-09T00:52:43Z", "userId": "user157", "transactionId": "trans1064", "products": [{"productId": "prod524", "quantity": 3, "description": "Tablet Case", "category": "Accessories", "price": 24.99}, {"productId": "prod518", "quantity": 3, "description": "USB-C Hub", "category": "Peripherals", "price": 54.99}, {"productId": "prod531", "quantity": 1, "description": "E-book Reader", "category": "Electronics", "price": 129.99}], "eventId": "44e3a338-d59a-4549-bd1d-3e5364ebee37"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-12T16:03:43Z", "userId": "user114", "transactionId": "trans1387", "products": [{"productId": "prod558", "quantity": 1, "description": "Graphic Tablet", "category": "Creative Tech", "price": 199.99}, {"productId": "prod554", "quantity": 1, "description": "Digital Camera", "category": "Photography", "price": 499.99}], "eventId": "33a5c304-86df-4ab9-8cd1-9d6909439810"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-14T01:51:52Z", 
"userId": "user131", "transactionId": "trans1260", "products": [{"productId": "prod597", "quantity": 3, "description": "Fitness Tracker", "category": "Wearables", "price": 59.99}], "eventId": "5ec9b5d4-940c-4907-976a-2ade869d5c5e"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-05T19:55:16Z", "userId": "user158", "transactionId": "trans1313", "products": [{"productId": "prod511", "quantity": 2, "description": "Portable Charger", "category": "Gadgets", "price": 29.99}, {"productId": "prod553", "quantity": 2, "description": "Wireless Mouse", "category": "Electronics", "price": 29.99}, {"productId": "prod511", "quantity": 2, "description": "Portable Charger", "category": "Gadgets", "price": 29.99}, {"productId": "prod567", "quantity": 2, "description": "Smart Light Bulb", "category": "Smart Home", "price": 49.99}, {"productId": "prod528", "quantity": 2, "description": "Streaming Stick", "category": "Entertainment", "price": 39.99}], "eventId": "9ca1ca07-c45b-4ca3-a2f1-306345afaa4d"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-06T23:31:08Z", "userId": "user174", "transactionId": "trans1483", "products": [{"productId": "prod572", "quantity": 2, "description": "VR Headset", "category": "Gaming", "price": 349.99}, {"productId": "prod559", "quantity": 3, "description": "Mechanical Keyboard", "category": "Gaming", "price": 129.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}, {"productId": "prod578", "quantity": 2, "description": "Smart Thermostat", "category": "Smart Home", "price": 249.99}, {"productId": "prod504", "quantity": 1, "description": "HD Webcam", "category": "Peripherals", "price": 99.99}], "eventId": "f564cce9-81dd-4605-ad92-9fb5af65d5c1"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-09T02:42:41Z", "userId": "user117", "transactionId": "trans1092", "products": [{"productId": "prod552", "quantity": 1, "description": "Computer Monitor", "category": "Electronics", "price": 229.99}, {"productId": "prod555", "quantity": 3, "description": "Gaming Chair", "category": "Gaming", "price": 359.99}, {"productId": "prod526", "quantity": 1, "description": "Wireless Earbuds", "category": "Audio", "price": 79.99}, {"productId": "prod511", "quantity": 2, "description": "Portable Charger", "category": "Gadgets", "price": 29.99}, {"productId": "prod524", "quantity": 2, "description": "Tablet Case", "category": "Accessories", "price": 24.99}], "eventId": "08ef7824-8a53-4c8e-8054-cc5b08d23c2a"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T22:36:40Z", "userId": "user146", "transactionId": "trans1482", "products": [{"productId": "prod511", "quantity": 3, "description": "Portable Charger", "category": "Gadgets", "price": 29.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", "price": 159.99}, {"productId": "prod545", "quantity": 2, "description": "Action Camera", "category": "Photography", "price": 299.99}], "eventId": "f6f6409e-a720-43b8-9960-e40cdf7b4a49"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-05T17:29:28Z", "userId": "user144", "transactionId": "trans1499", "products": [{"productId": "prod577", "quantity": 1, "description": "Compact Drone", "category": "Gadgets", "price": 459.99}, {"productId": "prod597", "quantity": 2, "description": "Fitness Tracker", "category": "Wearables", "price": 59.99}, {"productId": "prod574", "quantity": 2, "description": "Wireless Router", "category": "Networking", "price": 
129.99}], "eventId": "effb11d3-a863-4f92-ac0e-75821c93313e"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T19:29:01Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 39.99}, {"productId": "prod561", "quantity": 3, "description": "Smart Door Lock", "category": "Smart Home", "price": 159.99}, {"productId": "prod501", "quantity": 3, "description": "Wireless Charging Pad", "category": "Gadgets", "price": 59.99}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1fad"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-06T13:50:51Z", "userId": "user136", "transactionId": "trans1551", "products": [{"productId": "prod551", "quantity": 1, "description": "Bluetooth Speaker", "category": "Audio", "price": 89.99}, {"productId": "prod511", "quantity": 3, "description": "Portable Charger", "category": "Gadgets", "price": 29.99}, {"productId": "prod552", "quantity": 2, "description": "Computer Monitor", "category": "Electronics", "price": 229.99}, {"productId": "prod585", "quantity": 1, "description": "Gaming Keyboard", "category": "Gaming", "price": 119.99}], "eventId": "64812859-6849-42d1-9602-9c634eea7edb"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-12T10:10:01Z", "userId": "user177", "transactionId": "trans1960", "products": [{"productId": "prod515", "quantity": 3, "description": "Digital Notepad", "category": "Office", "price": 199.99}], "eventId": "2be473d8-56f3-4aaa-a3b9-7ee3c9809612"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-11T23:47:06Z", "userId": "user135", "transactionId": "trans1231", "products": [{"productId": "prod517", "quantity": 1, "description": "Electric Kettle", "category": "Kitchen Appliances", "price": 79.99}, {"productId": "prod573", "quantity": 2, "description": "Smart Vacuum Cleaner", "category": "Smart Home", "price": 349.99}, {"productId": "prod505", "quantity": 2, "description": "Smartwatch", "category": "Wearables", "price": 299.99}], "eventId": "eeeb349f-9311-46b6-84a5-948b68f864ec"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-11T10:49:22Z", "userId": "user177", "transactionId": "trans1705", "products": [{"productId": "prod583", "quantity": 2, "description": "Noise Cancelling Earbuds", "category": "Audio", "price": 159.99}, {"productId": "prod514", "quantity": 2, "description": "LED Desk Lamp", "category": "Office", "price": 49.99}, {"productId": "prod564", "quantity": 1, "description": "Smartphone Stand", "category": "Accessories", "price": 19.99}], "eventId": "a4378600-2ed9-4323-97b8-a3e340dd6cb9"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-14T01:18:03Z", "userId": "user126", "transactionId": "trans1372", "products": [{"productId": "prod569", "quantity": 2, "description": "Smart Thermostat", "category": "Smart Home", "price": 249.99}], "eventId": "83c728ac-aa9d-4d79-8ab6-d2093c875a7e"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-05T08:37:27Z", "userId": "user155", "transactionId": "trans1091", "products": [{"productId": "prod507", "quantity": 3, "description": "Wireless Earphones", "category": "Audio", "price": 99.99}, {"productId": "prod543", "quantity": 3, "description": "Gaming Mouse", "category": "Gaming", "price": 59.99}, {"productId": "prod518", "quantity": 2, "description": "USB-C Hub", "category": "Peripherals", "price": 54.99}, {"productId": "prod571", "quantity": 3, "description": "Smart Light Switch", "category": "Smart Home", "price": 
39.99}], "eventId": "336b1f88-9321-492e-99ba-b78269b1cfc5"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-03T22:54:28Z", "userId": "user161", "transactionId": "trans1300", "products": [{"productId": "prod555", "quantity": 1, "description": "Gaming Chair", "category": "Gaming", "price": 359.99}, {"productId": "prod586", "quantity": 2, "description": "Solar Power Bank", "category": "Gadgets", "price": 89.99}, {"productId": "prod554", "quantity": 1, "description": "Digital Camera", "category": "Photography", "price": 499.99}, {"productId": "prod505", "quantity": 2, "description": "Smartwatch", "category": "Wearables", "price": 299.99}, {"productId": "prod587", "quantity": 1, "description": "Robot Building Kit", "category": "Toys & Education", "price": 149.99}], "eventId": "ed3599f6-6428-4d75-ae20-7769e4741fb4"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T08:31:06Z", "userId": "user182", "transactionId": "trans1065", "products": [{"productId": "prod599", "quantity": 2, "description": "Augmented Reality Glasses", "category": "Wearables", "price": 549.99}, {"productId": "prod521", "quantity": 3, "description": "Smart Home Hub", "category": "Smart Home", "price": 129.99}, {"productId": "prod590", "quantity": 2, "description": "Streaming Microphone", "category": "Streaming", "price": 199.99}], "eventId": "b0d0cdaa-2547-4250-a64d-87f52c59195f"} -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T08:31:06Z", "userId": "user183", "transactionId": "trans1065", "products": [{"productId": "prod599", "quantity": null, "description": "Augmented Reality Glasses", "category": "Wearables", "price": 549.99}, {"productId": "prod521", "quantity": 3, "description": "Smart Home Hub", "category": "Smart Home", "price": 129.99}, {"productId": "prod590", "quantity": 2, "description": "Streaming Microphone", "category": "Streaming", "price": 199.99}], "eventId": "b0d0cdaa-2547-4250-a64d-87f52c59157f"} diff --git a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/purchasecompleted_2.json b/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/purchasecompleted_2.json deleted file mode 100644 index 7b8826c..0000000 --- a/src/main/scala/com/codely/lesson_05__spark_streaming/video_02__spark_streaming_agg/data/purchasecompleted_2.json +++ /dev/null @@ -1 +0,0 @@ -{"eventType": "PurchaseCompleted", "timestamp": "2024-02-07T19:29:01Z", "userId": "user196", "transactionId": "trans1050", "products": [{"productId": "prod571", "quantity": 1, "description": "Smart Light Switch", "category": "Smart Home", "price": 9000}], "eventId": "e3386641-c796-4d1e-ba25-db7be12c1faz"} diff --git a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_01__kafka_integration/KafkaIntegration.scala b/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_01__kafka_integration/KafkaIntegration.scala deleted file mode 100644 index b38fa7a..0000000 --- a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_01__kafka_integration/KafkaIntegration.scala +++ /dev/null @@ -1,58 +0,0 @@ -package com.codely.lesson_06_spark_streaming_kafka.video_01__kafka_integration - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.functions.{avg, col, explode, from_json, month, to_date, to_timestamp} -import com.codely.lesson_06_spark_streaming_kafka.video_01__kafka_integration.commons.Schemas -import org.apache.spark.sql.streaming.OutputMode - -object KafkaIntegration extends App { - - val spark = SparkSession 
- .builder() - .appName("kafkaIntegration") - .master("local[*]") - .getOrCreate() - - spark.sparkContext.setLogLevel("WARN") - - val kafkaDF = spark.readStream - .format("kafka") - .option("kafka.bootstrap.servers", "localhost:9092") - .option("startingOffsets", "earliest") - .option("subscribe", "topic-events") - .load() - .select( - from_json(col("value").cast("string"), Schemas.purchasedSchema) - .as("value") - ) - .select("value.*") - - import spark.implicits._ - - kafkaDF.writeStream - .format("console") - .outputMode(OutputMode.Update()) - .start() - .awaitTermination() - - val avgSpendingPerUserDF = kafkaDF - .withColumn( - "timestamp", - to_timestamp($"timestamp", "yyyy-MM-dd'T'HH:mm:ss.SSS'Z'") - ) - .select($"userId", explode($"products").as("product"), $"timestamp") - .select( - $"userId", - $"product.category", - month($"timestamp").alias("month"), - ($"product.price" * $"product.quantity").alias("totalSpent") - ) - .groupBy($"userId", $"category", $"month") - .agg(avg("totalSpent").alias("AvgSpending")) - - avgSpendingPerUserDF.writeStream - .format("console") - .outputMode(OutputMode.Update) - .start() - .awaitTermination() -} diff --git a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_01__kafka_integration/commons/Schemas.scala b/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_01__kafka_integration/commons/Schemas.scala deleted file mode 100644 index 6582bde..0000000 --- a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_01__kafka_integration/commons/Schemas.scala +++ /dev/null @@ -1,29 +0,0 @@ -package com.codely.lesson_06_spark_streaming_kafka.video_01__kafka_integration.commons - -object Schemas { - - import org.apache.spark.sql.types._ - - private val productType = new StructType() - .add("productId", StringType) - .add("quantity", IntegerType) - .add("description", StringType) - .add("category", StringType) - .add("price", DoubleType) - - val purchasedSchema: StructType = new StructType() - .add("eventType", StringType) - .add("timestamp", StringType) - .add("userId", StringType) - .add("transactionId", StringType) - .add("products", ArrayType(productType)) - .add("eventId", StringType) - - val viewedSchema: StructType = new StructType() - .add("eventType", StringType) - .add("timestamp", StringType) - .add("userId", StringType) - .add("productId", StringType) - .add("eventId", StringType) - -} diff --git a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_02__deploy_application/DeploySparkApp.scala b/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_02__deploy_application/DeploySparkApp.scala deleted file mode 100644 index 40cf05f..0000000 --- a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_02__deploy_application/DeploySparkApp.scala +++ /dev/null @@ -1,62 +0,0 @@ -package com.codely.lesson_06_spark_streaming_kafka.video_02__deploy_application - -import com.codely.lesson_07_spark_optimize_and_monitoring.video_01__deploy_application.commons.Schemas -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.functions._ -import org.apache.spark.sql.streaming.OutputMode - -object DeploySparkApp { - def main(args: Array[String]): Unit = { - val spark = SparkSession - .builder() - .appName("DeploySparkApp") - .enableHiveSupport() - .getOrCreate() - - val kafkaDF = spark.readStream - .format("kafka") - .option("kafka.bootstrap.servers", "172.18.0.4:9092") - .option("startingOffsets", "earliest") - .option("subscribe", "topic-events") - .load() - .select( - 
from_json(col("value").cast("string"), Schemas.purchasedSchema) - .as("value") - ) - .select("value.*") - - import spark.implicits._ - - val avgSpendingPerUserDF = kafkaDF - .withColumn("date", to_date($"timestamp", "yyyy-MM-dd'T'HH:mm:ss'Z'")) - .select($"userId", explode($"products").as("product"), $"date") - .select( - $"userId", - $"product.category", - month($"date").alias("month"), - ($"product.price" * $"product.quantity").alias("totalSpent") - ) - .groupBy($"userId", $"category", $"month") - .agg(avg("totalSpent").alias("AvgSpending")) - - spark.sql(""" - CREATE TABLE IF NOT EXISTS avg_spending ( - userId STRING, - category STRING, - month INT, - AvgSpending DOUBLE - ) - USING delta - LOCATION 's3a://my-bucket/avg_spending' - """) - - avgSpendingPerUserDF.writeStream - .format("delta") - .option("checkpointLocation", "s3a://my-bucket/checkpoint") - .option("path", "s3a://my-bucket/avg_spending") - .outputMode(OutputMode.Complete()) - .start() - .awaitTermination() - - } -} diff --git a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_02__deploy_application/commons/Schemas.scala b/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_02__deploy_application/commons/Schemas.scala deleted file mode 100644 index 348a1d7..0000000 --- a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_02__deploy_application/commons/Schemas.scala +++ /dev/null @@ -1,29 +0,0 @@ -package com.codely.lesson_07_spark_optimize_and_monitoring.video_01__deploy_application.commons - -object Schemas { - - import org.apache.spark.sql.types._ - - private val productType = new StructType() - .add("productId", StringType) - .add("quantity", IntegerType) - .add("description", StringType) - .add("category", StringType) - .add("price", DoubleType) - - val purchasedSchema: StructType = new StructType() - .add("eventType", StringType) - .add("timestamp", StringType) - .add("userId", StringType) - .add("transactionId", StringType) - .add("products", ArrayType(productType)) - .add("eventId", StringType) - - val viewedSchema: StructType = new StructType() - .add("eventType", StringType) - .add("timestamp", StringType) - .add("userId", StringType) - .add("productId", StringType) - .add("eventId", StringType) - -} diff --git a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_02__deploy_application/spark-submit.sh b/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_02__deploy_application/spark-submit.sh deleted file mode 100644 index 24bd84d..0000000 --- a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/video_02__deploy_application/spark-submit.sh +++ /dev/null @@ -1,20 +0,0 @@ -export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/jre/ - - spark/bin/spark-submit \ - --class com.codely.lesson_06_spark_streaming_kafka.video_02__deploy_application.DeploySparkApp \ - --deploy-mode client \ - --master spark://spark-master:7077 \ - --conf spark.sql.uris=thrift://hive-metastore:9083 \ - --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \ - --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \ - --conf spark.hadoop.fs.s3a.access.key=test \ - --conf spark.hadoop.fs.s3a.secret.key=test \ - --conf spark.hadoop.fs.s3a.endpoint=http://s3-storage:4566 \ - --conf spark.hadoop.fs.s3a.path.style.access=true \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.driver.memory=1g \ - --conf spark.executor.memory=1g \ - --conf spark.executor.cores=1 \ - --verbose \ - --packages 
org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,io.delta:delta-spark_2.12:3.1.0,org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.375 \ - spark-apps/spark-for-programmers-course-assembly-0.1.0-SNAPSHOT.jar \ No newline at end of file diff --git a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/z_practical_exercise/KafkaSparkStreamingApp.scala b/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/z_practical_exercise/KafkaSparkStreamingApp.scala deleted file mode 100644 index 42250bc..0000000 --- a/src/main/scala/com/codely/lesson_06_spark_streaming_kafka/z_practical_exercise/KafkaSparkStreamingApp.scala +++ /dev/null @@ -1,36 +0,0 @@ -package com.codely.lesson_06_spark_streaming_kafka.z_practical_exercise - -import org.apache.spark.sql.SparkSession - -object KafkaSparkStreamingApp extends App { - - val spark = SparkSession - .builder() - .appName("KafkaSparkStreamingApp") - .master("local[*]") - .getOrCreate() - - spark.sparkContext.setLogLevel("WARN") - - val kafkaDF = spark.readStream - .format("kafka") - .option("kafka.bootstrap.servers", "172.18.0.4:9092") - .option("subscribe", "topic-events") - .option("startingOffsets", "earliest") - .load() - - import spark.implicits._ - val messagesDF = kafkaDF.selectExpr("CAST(value AS STRING)").as[String] - - val wordsDF = messagesDF - .flatMap(_.split(" ")) - .groupBy("value") - .count() - - val query = wordsDF.writeStream - .outputMode("update") - .format("console") - .start() - - query.awaitTermination() -} diff --git a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/video_01__data_sources/SQSSparkReceiver.scala b/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/video_01__data_sources/SQSSparkReceiver.scala deleted file mode 100644 index 160bef0..0000000 --- a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/video_01__data_sources/SQSSparkReceiver.scala +++ /dev/null @@ -1,54 +0,0 @@ -package com.codely.lesson_07_spark_streaming_sqs.video_01__data_sources - -import com.amazonaws.client.builder.AwsClientBuilder -import com.amazonaws.services.sqs.{AmazonSQS, AmazonSQSClientBuilder} -import com.amazonaws.services.sqs.model.{DeleteMessageRequest, ReceiveMessageRequest} -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.receiver.Receiver - -import scala.collection.JavaConverters._ - -class SQSSparkReceiver(endpoint: String, region: String, queueUrl: String) - extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) { - - private var sqsClient: AmazonSQS = _ - def onStart(): Unit = { - - sqsClient = AmazonSQSClientBuilder - .standard() - .withEndpointConfiguration( - new AwsClientBuilder.EndpointConfiguration( - endpoint, - region - ) - ) - .build() - - new Thread("SQS Receiver") { - override def run() { - receive() - } - }.start() - } - - def onStop(): Unit = { - // Any necessary cleanup - } - - private def receive(): Unit = { - while (!isStopped()) { - val request = new ReceiveMessageRequest(queueUrl) - .withMaxNumberOfMessages(10) - .withWaitTimeSeconds(20) - - val messages = sqsClient.receiveMessage(request).getMessages.asScala - - for (message <- messages) { - store(message.getBody) - val deleteRequest = - new DeleteMessageRequest(queueUrl, message.getReceiptHandle) - sqsClient.deleteMessage(deleteRequest) - } - } - } -} diff --git a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/video_02__sqs_integration/SQSReceiverSparkApp.scala b/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/video_02__sqs_integration/SQSReceiverSparkApp.scala 
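// Editorial aside, not part of the patch: the SQS and RabbitMQ sources around this point all
// follow the same legacy DStream pattern, a custom Receiver wired into a StreamingContext.
// A minimal, self-contained version of that pattern (the constant message is an illustrative
// stand-in for polling a real queue):
import org.apache.spark.SparkConf
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.{Seconds, StreamingContext}

class ConstantMessageReceiver(message: String) extends Receiver[String](StorageLevel.MEMORY_ONLY) {

  def onStart(): Unit =
    new Thread("Constant Message Receiver") {
      override def run(): Unit =
        while (!isStopped()) {
          store(message) // a real receiver would poll the external system here
          Thread.sleep(1000)
        }
    }.start()

  def onStop(): Unit = ()
}

object ReceiverPatternSketch extends App {

  val conf = new SparkConf().setAppName("ReceiverPatternSketch").setMaster("local[*]")
  val ssc  = new StreamingContext(conf, Seconds(1))

  ssc.receiverStream(new ConstantMessageReceiver("hello")).print()

  ssc.start()
  ssc.awaitTermination()
}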
deleted file mode 100644 index 3c66b44..0000000 --- a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/video_02__sqs_integration/SQSReceiverSparkApp.scala +++ /dev/null @@ -1,37 +0,0 @@ -package com.codely.lesson_07_spark_streaming_sqs.video_02__sqs_integration - -import com.amazonaws.client.builder.AwsClientBuilder -import com.amazonaws.services.sqs.{AmazonSQS, AmazonSQSClientBuilder} -import org.apache.spark.SparkConf -import org.apache.spark.streaming.{Seconds, StreamingContext} - -//noinspection ScalaDeprecation -object SQSReceiverSparkApp extends App { - private val sqsEndpoint = "http://localhost:4566" - private val region = "us-east-1" - private val queueUrl = - "http://sqs.us-east-1.localhost.localstack.cloud:4566/000000000000/send_welcome_email_on_user_registered" - - val conf = - new SparkConf().setAppName("SQSReceiverSparkApp").setMaster("local[*]") - - val ssc = new StreamingContext(conf, Seconds(1)) - - val sqsClient: AmazonSQS = AmazonSQSClientBuilder - .standard() - .withEndpointConfiguration( - new AwsClientBuilder.EndpointConfiguration( - "http://localhost:4566", - "us-east-1" - ) - ) - .build() - - val receiver = new SQSSparkReceiver(sqsEndpoint, region, queueUrl) - val messages = ssc.receiverStream(receiver) - messages.print() - - ssc.start() - ssc.awaitTermination() - -} diff --git a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/video_02__sqs_integration/SQSSparkReceiver.scala b/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/video_02__sqs_integration/SQSSparkReceiver.scala deleted file mode 100644 index a8960d0..0000000 --- a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/video_02__sqs_integration/SQSSparkReceiver.scala +++ /dev/null @@ -1,54 +0,0 @@ -package com.codely.lesson_07_spark_streaming_sqs.video_02__sqs_integration - -import com.amazonaws.client.builder.AwsClientBuilder -import com.amazonaws.services.sqs.{AmazonSQS, AmazonSQSClientBuilder} -import com.amazonaws.services.sqs.model.{DeleteMessageRequest, ReceiveMessageRequest} -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.receiver.Receiver - -import scala.collection.JavaConverters._ - -//noinspection ScalaDeprecation -class SQSSparkReceiver(endpoint: String, region: String, queueUrl: String) - extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) { - private var sqsClient: AmazonSQS = _ - def onStart(): Unit = { - - sqsClient = AmazonSQSClientBuilder - .standard() - .withEndpointConfiguration( - new AwsClientBuilder.EndpointConfiguration( - endpoint, - region - ) - ) - .build() - - new Thread("SQS Receiver") { - override def run() { - receive() - } - }.start() - } - - def onStop(): Unit = { - // Any necessary cleanup - } - - private def receive(): Unit = { - while (!isStopped()) { - val request = new ReceiveMessageRequest(queueUrl) - .withMaxNumberOfMessages(10) - .withWaitTimeSeconds(20) - - val messages = sqsClient.receiveMessage(request).getMessages.asScala - - for (message <- messages) { - store(message.getBody) - val deleteRequest = - new DeleteMessageRequest(queueUrl, message.getReceiptHandle) - sqsClient.deleteMessage(deleteRequest) - } - } - } -} diff --git a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/z_practical_exercise/RabbitMQReceiver.scala b/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/z_practical_exercise/RabbitMQReceiver.scala deleted file mode 100644 index 7b908dc..0000000 --- a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/z_practical_exercise/RabbitMQReceiver.scala +++ /dev/null @@ -1,44 
+0,0 @@ -package com.codely.lesson_07_spark_streaming_sqs.z_practical_exercise - -import org.apache.spark.storage.StorageLevel -import org.apache.spark.streaming.receiver.Receiver -import com.rabbitmq.client.{AMQP, Connection, ConnectionFactory, DefaultConsumer, Envelope, Channel} - -class RabbitMQReceiver(queueName: String, host: String, port: Int) - extends Receiver[String](StorageLevel.MEMORY_AND_DISK_2) { - - @transient var connection: Connection = _ - @transient var channel: Channel = _ - - def onStart(): Unit = { - val factory = new ConnectionFactory() - factory.setHost(host) - factory.setPort(port) - connection = factory.newConnection() - channel = connection.createChannel() - channel.queueDeclare(queueName, true, false, false, null) - - val consumer = new DefaultConsumer(channel) { - override def handleDelivery( - consumerTag: String, - envelope: Envelope, - properties: AMQP.BasicProperties, - body: Array[Byte] - ): Unit = { - val message = new String(body, "UTF-8") - store(message) - } - } - - channel.basicConsume(queueName, true, consumer) - } - - def onStop(): Unit = { - if (channel != null) { - channel.close() - } - if (connection != null) { - connection.close() - } - } -} diff --git a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/z_practical_exercise/RabbitMQStreamingApp.scala b/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/z_practical_exercise/RabbitMQStreamingApp.scala deleted file mode 100644 index 67da229..0000000 --- a/src/main/scala/com/codely/lesson_07_spark_streaming_sqs/z_practical_exercise/RabbitMQStreamingApp.scala +++ /dev/null @@ -1,25 +0,0 @@ -package com.codely.lesson_07_spark_streaming_sqs.z_practical_exercise - -import org.apache.spark.SparkConf -import org.apache.spark.streaming.{Seconds, StreamingContext} - -//noinspection ScalaDeprecation -object RabbitMQStreamingApp { - def main(args: Array[String]): Unit = { - val conf = - new SparkConf().setAppName("RabbitMQStreamingApp").setMaster("local[*]") - val ssc = new StreamingContext(conf, Seconds(5)) - - val stream = - ssc.receiverStream(new RabbitMQReceiver("spark-queue", "localhost", 5672)) - - stream.foreachRDD { rdd => - rdd.foreach { message => - println(s"Received message: $message") - } - } - - ssc.start() - ssc.awaitTermination() - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/app/AvgSpendingApp.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/app/AvgSpendingApp.scala deleted file mode 100644 index 71e9762..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/app/AvgSpendingApp.scala +++ /dev/null @@ -1,26 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.app - -import com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.job.AvgSpendingJob -import com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.service.{DeltaTableWriter, JDBCReader} -import com.typesafe.config.{Config, ConfigFactory} -import org.apache.spark.sql.SparkSession - -object AvgSpendingApp extends App { - - private val appName: String = "Avg-spending-app" - private val config: Config = ConfigFactory.load().getConfig(appName) - - implicit val spark: SparkSession = SparkSession - .builder() - .appName(appName) - .enableHiveSupport() - .getOrCreate() - - private val reader = JDBCReader() - private val deltaWriter = DeltaTableWriter() - - val job = AvgSpendingJob(config, reader, deltaWriter) - job.run() - spark.stop() - -} diff --git 
a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/job/AvgSpendingJob.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/job/AvgSpendingJob.scala deleted file mode 100644 index 3c515d5..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/job/AvgSpendingJob.scala +++ /dev/null @@ -1,37 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.job - -import com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.service.{DeltaTableWriter, JDBCReader} -import com.typesafe.config.Config -import org.apache.spark.sql.{DataFrame, SparkSession} - -case class AvgSpendingJob( - config: Config, - reader: JDBCReader, - writer: DeltaTableWriter -)(implicit spark: SparkSession) { - - def run(): Unit = { - - val data = readDataFromKafka() - val avgSpendingPerUserDF = calculateSumByName(data) - writeToDelta(avgSpendingPerUserDF) - } - - private def readDataFromKafka(): DataFrame = { - reader.readFromJDBC(config.getString("jdbc.url")) - } - - private def calculateSumByName(data: DataFrame): DataFrame = { - data - .groupBy("name") - .sum("value") - .withColumnRenamed("sum(value)", "total_spending") - } - - private def writeToDelta(dataFrame: DataFrame) = { - writer.writeToDeltaTable( - dataFrame, - config.getString("delta.path") - ) - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/service/DeltaTableWriter.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/service/DeltaTableWriter.scala deleted file mode 100644 index bf018d2..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/service/DeltaTableWriter.scala +++ /dev/null @@ -1,12 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.service - -import org.apache.spark.sql.DataFrame - -case class DeltaTableWriter() { - def writeToDeltaTable( - df: DataFrame, - path: String - ): Unit = { - df.write.mode("overwrite").format("delta").save(path) - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/service/JDBCReader.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/service/JDBCReader.scala deleted file mode 100644 index 50aa8a3..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/service/JDBCReader.scala +++ /dev/null @@ -1,17 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.service - -import org.apache.spark.sql.{DataFrame, SparkSession} - -case class JDBCReader(implicit - spark: SparkSession -) { - def readFromJDBC(url: String): DataFrame = { - spark.read - .format("jdbc") - .option("url", url) - .option("dbtable", "example_table") - .option("user", "admin") - .option("password", "secret") - .load() - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/app/AvgSpendingApp.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/app/AvgSpendingApp.scala deleted file mode 100644 index b94561e..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/app/AvgSpendingApp.scala +++ /dev/null @@ -1,25 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.app - -import com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.job.AvgSpendingJob -import 
com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.service.{DeltaTableWriter, kafkaReader} -import com.typesafe.config.{Config, ConfigFactory} -import org.apache.spark.sql.SparkSession - -object AvgSpendingApp extends App { - - private val appName: String = "Avg-spending-app" - private val config: Config = ConfigFactory.load().getConfig(appName) - - implicit val spark: SparkSession = SparkSession - .builder() - .appName(appName) - .enableHiveSupport() - .getOrCreate() - - private val reader = kafkaReader() - private val writer = DeltaTableWriter() - - AvgSpendingJob(config, reader, writer).run() - spark.stop() - -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/commons/Schemas.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/commons/Schemas.scala deleted file mode 100644 index be89cfc..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/commons/Schemas.scala +++ /dev/null @@ -1,22 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.commons - -object Schemas { - - import org.apache.spark.sql.types._ - - private val productType = new StructType() - .add("productId", StringType) - .add("quantity", IntegerType) - .add("description", StringType) - .add("category", StringType) - .add("price", DoubleType) - - val purchasedSchema: StructType = new StructType() - .add("eventType", StringType) - .add("timestamp", StringType) - .add("userId", StringType) - .add("transactionId", StringType) - .add("products", ArrayType(productType)) - .add("eventId", StringType) - -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/job/AvgSpendingJob.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/job/AvgSpendingJob.scala deleted file mode 100644 index 357df19..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/job/AvgSpendingJob.scala +++ /dev/null @@ -1,40 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.job - -import com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.service.{AvgSpending, DeltaTableWriter, kafkaReader} -import com.typesafe.config.Config -import org.apache.spark.sql.{DataFrame, SparkSession} - -case class AvgSpendingJob( - config: Config, - kafkaService: kafkaReader, - deltaService: DeltaTableWriter -)(implicit spark: SparkSession) { - - def run(): Unit = { - - val data = readDataFromKafka() - val avgSpendingPerUserDF = calculateAvgSpending(data) - - val query = writeDataToDelta(avgSpendingPerUserDF) - query.awaitTermination() - } - - private def readDataFromKafka(): DataFrame = { - kafkaService.readFromKafka( - config.getString("kafka.server"), - config.getString("kafka.topic") - ) - } - - protected def calculateAvgSpending(data: DataFrame): DataFrame = { - AvgSpending.calculate(data) - } - - private def writeDataToDelta(dataFrame: DataFrame) = { - deltaService.writeToDeltaTable( - dataFrame, - config.getString("delta.path"), - config.getString("delta.checkpointPath") - ) - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/service/AvgSpending.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/service/AvgSpending.scala deleted file mode 100644 index f9fcc87..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/service/AvgSpending.scala 
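// Editorial aside, not part of the patch: the lesson_08 services deleted in this range are
// structured so the DataFrame transformations can be unit-tested in isolation. A minimal
// ScalaTest sketch of that idea, with a local SparkSession and hand-built input (names and
// values are illustrative assumptions):
import org.apache.spark.sql.SparkSession
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class TotalSpentSpec extends AnyFlatSpec with Matchers {

  private val spark = SparkSession
    .builder()
    .appName("TotalSpentSpec")
    .master("local[1]")
    .getOrCreate()

  import spark.implicits._

  "totalSpent" should "be price times quantity" in {
    val input = Seq(("user1", 10.0, 2), ("user2", 5.0, 3)).toDF("userId", "price", "quantity")

    val result = input.withColumn("totalSpent", $"price" * $"quantity")

    result.select("totalSpent").as[Double].collect() should contain theSameElementsAs Seq(20.0, 15.0)
  }
}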
+++ /dev/null @@ -1,32 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.service - -import com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.commons.Schemas -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{avg, col, explode, from_json, month, to_date} - -object AvgSpending { - - def calculate(dataFrame: DataFrame): DataFrame = { - - dataFrame - .select( - from_json(col("value").cast("string"), Schemas.purchasedSchema) - .as("value") - ) - .select("value.*") - .withColumn("date", to_date(col("timestamp"), "yyyy-MM-dd'T'HH:mm:ss'Z'")) - .select( - col("userId"), - explode(col("products")).as("product"), - col("date") - ) - .select( - col("userId"), - col("product.category"), - month(col("date")).alias("month"), - (col("product.price") * col("product.quantity")).alias("totalSpent") - ) - .groupBy(col("userId"), col("category"), col("month")) - .agg(avg("totalSpent").alias("AvgSpending")) - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/service/DeltaTableWriter.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/service/DeltaTableWriter.scala deleted file mode 100644 index 7a9c7e2..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/service/DeltaTableWriter.scala +++ /dev/null @@ -1,19 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.service - -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery} - -case class DeltaTableWriter() { - def writeToDeltaTable( - df: DataFrame, - path: String, - checkpointPath: String - ): StreamingQuery = { - df.writeStream - .format("delta") - .option("checkpointLocation", checkpointPath) - .option("path", path) - .outputMode(OutputMode.Complete()) - .start() - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/service/kafkaReader.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/service/kafkaReader.scala deleted file mode 100644 index 8f80d0c..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/service/kafkaReader.scala +++ /dev/null @@ -1,18 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.service - -import com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.commons.Schemas -import org.apache.spark.sql.{DataFrame, SparkSession} -import org.apache.spark.sql.functions._ - -case class kafkaReader(implicit - spark: SparkSession -) { - def readFromKafka(bootstrapServers: String, topic: String): DataFrame = { - spark.read - .format("kafka") - .option("kafka.bootstrap.servers", bootstrapServers) - .option("startingOffsets", "earliest") - .option("subscribe", topic) - .load() - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/app/AvgSpendingApp.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/app/AvgSpendingApp.scala deleted file mode 100644 index e9fde26..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/app/AvgSpendingApp.scala +++ /dev/null @@ -1,25 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.z_practical_exercise.app - -import com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.job.AvgSpendingJob -import 
com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.service.{DeltaTableWriter, kafkaReader} -import com.typesafe.config.{Config, ConfigFactory} -import org.apache.spark.sql.SparkSession - -object AvgSpendingApp extends App { - - private val appName: String = "Avg-spending-app" - private val config: Config = ConfigFactory.load().getConfig(appName) - - implicit val spark: SparkSession = SparkSession - .builder() - .appName(appName) - .enableHiveSupport() - .getOrCreate() - - private val reader = kafkaReader() - private val writer = DeltaTableWriter() - - AvgSpendingJob(config, reader, writer).run() - spark.stop() - -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/commons/Schemas.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/commons/Schemas.scala deleted file mode 100644 index fd2d77f..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/commons/Schemas.scala +++ /dev/null @@ -1,22 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.z_practical_exercise.commons - -object Schemas { - - import org.apache.spark.sql.types._ - - private val productType = new StructType() - .add("productId", StringType) - .add("quantity", IntegerType) - .add("description", StringType) - .add("category", StringType) - .add("price", DoubleType) - - val purchasedSchema: StructType = new StructType() - .add("eventType", StringType) - .add("timestamp", StringType) - .add("userId", StringType) - .add("transactionId", StringType) - .add("products", ArrayType(productType)) - .add("eventId", StringType) - -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/job/AvgSpendingJob.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/job/AvgSpendingJob.scala deleted file mode 100644 index 4585fa5..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/job/AvgSpendingJob.scala +++ /dev/null @@ -1,38 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.z_practical_exercise.job - -import com.codely.lesson_08_tests_in_spark.z_practical_exercise.service.{AvgSpendingCalculator, DeltaTableWriter, kafkaReader} -import com.typesafe.config.Config -import org.apache.spark.sql.{DataFrame, SparkSession} - -case class AvgSpendingJob( - config: Config, - kafkaService: kafkaReader, - deltaService: DeltaTableWriter -)(implicit - spark: SparkSession, - avgSpendingCalculator: AvgSpendingCalculator[DataFrame] -) { - - def run(): Unit = { - val data = readDataFromKafka() - val avgSpendingPerUserDF = avgSpendingCalculator.calculate(data) - - val query = writeDataToDelta(avgSpendingPerUserDF) - query.awaitTermination() - } - - private def readDataFromKafka(): DataFrame = { - kafkaService.readFromKafka( - config.getString("kafka.server"), - config.getString("kafka.topic") - ) - } - - private def writeDataToDelta(dataFrame: DataFrame) = { - deltaService.writeToDeltaTable( - dataFrame, - config.getString("delta.path"), - config.getString("delta.checkpointPath") - ) - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/service/AvgSpendingFunction.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/service/AvgSpendingFunction.scala deleted file mode 100644 index 51a779d..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/service/AvgSpendingFunction.scala +++ /dev/null @@ -1,43 +0,0 @@ -package 
com.codely.lesson_08_tests_in_spark.z_practical_exercise.service - -import com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.commons.Schemas -import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{avg, col, explode, from_json, month, to_date} - -import scala.language.implicitConversions - -trait AvgSpendingCalculator[T] { - def calculate(dataFrame: DataFrame): DataFrame -} - -object AvgSpendingCalculator { - def apply[T](implicit - instance: AvgSpendingCalculator[T] - ): AvgSpendingCalculator[T] = instance -} - -object AvgSpendingFunction extends AvgSpendingCalculator[DataFrame] { - - override def calculate(dataFrame: DataFrame): DataFrame = { - dataFrame - .select( - from_json(col("value").cast("string"), Schemas.purchasedSchema) - .as("value") - ) - .select("value.*") - .withColumn("date", to_date(col("timestamp"), "yyyy-MM-dd'T'HH:mm:ss'Z'")) - .select( - col("userId"), - explode(col("products")).as("product"), - col("date") - ) - .select( - col("userId"), - col("product.category"), - month(col("date")).alias("month"), - (col("product.price") * col("product.quantity")).alias("totalSpent") - ) - .groupBy(col("userId"), col("category"), col("month")) - .agg(avg("totalSpent").alias("AvgSpending")) - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/service/DeltaTableWriter.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/service/DeltaTableWriter.scala deleted file mode 100644 index 0554539..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/service/DeltaTableWriter.scala +++ /dev/null @@ -1,19 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.z_practical_exercise.service - -import org.apache.spark.sql.{DataFrame, Row} -import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery} - -case class DeltaTableWriter() { - def writeToDeltaTable( - df: DataFrame, - path: String, - checkpointPath: String - ): StreamingQuery = { - df.writeStream - .format("delta") - .option("checkpointLocation", checkpointPath) - .option("path", path) - .outputMode(OutputMode.Complete()) - .start() - } -} diff --git a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/service/kafkaReader.scala b/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/service/kafkaReader.scala deleted file mode 100644 index dc34456..0000000 --- a/src/main/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/service/kafkaReader.scala +++ /dev/null @@ -1,16 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.z_practical_exercise.service - -import org.apache.spark.sql.{DataFrame, SparkSession} - -case class kafkaReader(implicit - spark: SparkSession -) { - def readFromKafka(bootstrapServers: String, topic: String): DataFrame = { - spark.read - .format("kafka") - .option("kafka.bootstrap.servers", bootstrapServers) - .option("startingOffsets", "earliest") - .option("subscribe", topic) - .load() - } -} diff --git a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_01__how_spark_works/HowSparkWorks.scala b/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_01__how_spark_works/HowSparkWorks.scala deleted file mode 100644 index d1bea61..0000000 --- a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_01__how_spark_works/HowSparkWorks.scala +++ /dev/null @@ -1,27 +0,0 @@ -package com.codely.lesson_09_basics_spark_optimization.video_01__how_spark_works - -import org.apache.spark.rdd.RDD - -object 
HowSparkWorks extends App { - - // 1. docker exec -it spark-kafka-cluster-spark-master-1 bash - // 2. ./bin/spark-shell --master spark://spark-master:7077 --total-executor-cores 2 --executor-memory 512m - - val spark = org.apache.spark.sql.SparkSession.builder - .master("local") - .appName("Spark Example") - .getOrCreate() - - val sc = spark.sparkContext - val numbers: RDD[Int] = sc.parallelize(1 to 1000) - numbers.count() - - // localhost:4040 - - val doubledNumbers = numbers.map(_ * 2) - doubledNumbers.count() - - val groupedNumbers = doubledNumbers.groupBy(_ % 2) - groupedNumbers.count() - -} diff --git a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_02__reading_query_plans/QueryPlans.scala b/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_02__reading_query_plans/QueryPlans.scala deleted file mode 100644 index 09e1401..0000000 --- a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_02__reading_query_plans/QueryPlans.scala +++ /dev/null @@ -1,42 +0,0 @@ -package com.codely.lesson_09_basics_spark_optimization.video_02__reading_query_plans - -object QueryPlans extends App { - - val spark = org.apache.spark.sql.SparkSession.builder - .master("local[2]") - .appName("Spark Example") - .getOrCreate() - - spark.sparkContext.setLogLevel("WARN") - - val sc = spark.sparkContext - - val rangeDs = spark.range(1000) - rangeDs.explain() - - val rangeDsFiltered = rangeDs.selectExpr("id * 2 as id") - rangeDsFiltered.explain() - import spark.implicits._ - - val anotherDs = Seq( - (0, "zero"), - (2, "two"), - (4, "four"), - (6, "six"), - (8, "eight") - ).toDF("id", "name") - - val joinedDs = rangeDsFiltered.join(anotherDs, "id") - joinedDs.explain() - joinedDs.show() - - val agg = joinedDs.selectExpr("sum(id)") - agg.explain() - agg.show() - - val bigRangeDs = spark.range(2000000000) - val anotherBigDs = spark.range(2000000000) - val joinedBigDs = bigRangeDs.join(anotherBigDs, "id") - joinedBigDs.explain() - -} diff --git a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_03__monitor_your_app_with_grafana/dashboard.json b/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_03__monitor_your_app_with_grafana/dashboard.json deleted file mode 100644 index fdd97e7..0000000 --- a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_03__monitor_your_app_with_grafana/dashboard.json +++ /dev/null @@ -1,715 +0,0 @@ -{ - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": 1, - "links": [], - "panels": [ - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 4, - "x": 0, - "y": 0 - }, - "id": 4, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, 
- "pluginVersion": "11.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "metrics_master_workers_Value", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Total Workers", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 4, - "x": 4, - "y": 0 - }, - "id": 6, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "metrics_master_aliveWorkers_Value", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Workers alive", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 7, - "x": 8, - "y": 0 - }, - "id": 1, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" - }, - "pluginVersion": "11.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(instance) (metrics_worker_memUsed_MB_Number)", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Workers memory used", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 8, - "x": 15, - "y": 0 - }, - "id": 2, - "options": { - "minVizHeight": 75, - "minVizWidth": 75, - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true, - "sizing": "auto" - }, - "pluginVersion": "11.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": 
"fdqiu5rtqvfuoe" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(instance) (metrics_worker_coresUsed_Value)", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Workers cores used", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 4, - "x": 0, - "y": 7 - }, - "id": 5, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "metrics_master_waitingApps_Value", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Waiting apps", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "thresholds" - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 4, - "x": 4, - "y": 7 - }, - "id": 8, - "options": { - "colorMode": "value", - "graphMode": "area", - "justifyMode": "auto", - "orientation": "auto", - "percentChangeColorMode": "standard", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showPercentChange": false, - "textMode": "auto", - "wideLayout": true - }, - "pluginVersion": "11.1.0", - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "fdqiu5rtqvfuoe" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "metrics_master_apps_Number", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Total apps", - "type": "stat" - }, - { - "datasource": { - "type": "prometheus", - "uid": "bdqg3svckp9fkd" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - 
"color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [ - { - "matcher": { - "id": "byName", - "options": "app-20240702165007-0020" - }, - "properties": [] - } - ] - }, - "gridPos": { - "h": 7, - "w": 15, - "x": 8, - "y": 7 - }, - "id": 7, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "bdqg3svckp9fkd" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(application_id) (metrics_executor_failedTasks_total)", - "format": "time_series", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "failed", - "range": true, - "refId": "A", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "bdqg3svckp9fkd" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(application_id) (metrics_executor_activeTasks)", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "active", - "range": true, - "refId": "B", - "useBackend": false - }, - { - "datasource": { - "type": "prometheus", - "uid": "bdqg3svckp9fkd" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum by(application_id) (metrics_executor_completedTasks_total)", - "fullMetaSearch": false, - "hide": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "completed", - "range": true, - "refId": "C", - "useBackend": false - } - ], - "title": "Tasks completed vs failed", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus", - "uid": "bdqg3svckp9fkd" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisBorderShow": false, - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 0, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "insertNulls": false, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "none" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 10, - "w": 23, - "x": 0, - "y": 14 - }, - "id": 9, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus", - "uid": "bdqg3svckp9fkd" - }, - "disableTextWrap": false, - "editorMode": "builder", - "expr": "sum(metrics_worker_memFree_MB_Value)", - "fullMetaSearch": false, - "includeNullMetadata": true, - "instant": false, - "legendFormat": "__auto", - "range": true, - "refId": "A", - "useBackend": false - } - ], - "title": "Panel Title", - "type": "timeseries" - } - ], - "schemaVersion": 39, - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-15m", - "to": "now" - }, - "timepicker": {}, - "timezone": "browser", - "title": "Spark 
Cluster metrics Demo2", - "uid": "bdqiuxl7bh98gc", - "version": 4, - "weekStart": "" -} \ No newline at end of file diff --git a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_03__monitor_your_app_with_grafana/spark-submit.sh b/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_03__monitor_your_app_with_grafana/spark-submit.sh deleted file mode 100644 index d197cfa..0000000 --- a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video_03__monitor_your_app_with_grafana/spark-submit.sh +++ /dev/null @@ -1,21 +0,0 @@ - spark/bin/spark-submit \ - --class com.codely.lesson_06_spark_streaming_kafka.video_02__deploy_application.DeploySparkApp \ - --deploy-mode client \ - --master spark://spark-master:7077 \ - --conf spark.sql.uris=thrift://hive-metastore:9083 \ - --conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension \ - --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog \ - --conf spark.hadoop.fs.s3a.access.key=test \ - --conf spark.hadoop.fs.s3a.secret.key=test \ - --conf spark.hadoop.fs.s3a.endpoint=http://s3-storage:4566 \ - --conf spark.hadoop.fs.s3a.path.style.access=true \ - --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ - --conf spark.driver.memory=1g \ - --conf spark.executor.memory=1g \ - --conf spark.executor.cores=1 \ - --conf spark.ui.prometheus.enabled=true \ - --conf spark.executor.processTreeMetrics.enabled=true \ - --conf spark.eventLog.logStageExecutorMetrics=true \ - --verbose \ - --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,io.delta:delta-spark_2.12:3.1.0,org.apache.hadoop:hadoop-aws:3.3.1,com.amazonaws:aws-java-sdk-bundle:1.11.375 \ - spark-apps/spark-for-programmers-course-assembly-0.1.0-SNAPSHOT.jar \ No newline at end of file diff --git a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video__04_join_optimization/JoinOptimizationApp.scala b/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video__04_join_optimization/JoinOptimizationApp.scala deleted file mode 100644 index a1021d1..0000000 --- a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video__04_join_optimization/JoinOptimizationApp.scala +++ /dev/null @@ -1,30 +0,0 @@ -package com.codely.lesson_09_basics_spark_optimization.video__04_join_optimization - -import org.apache.spark.sql.functions.broadcast - -object JoinOptimizationApp extends SparkApp { - - spark.sparkContext.setLogLevel("WARN") - - import spark.implicits._ - - val dataFrame1 = - Seq((1, "Alice", 50), (2, "Bob", 80), (3, "Javi", 99)) - .toDF("id", "name", "score") - - val largeDataFrame = spark - .range(1, 100000000L) - .map(i => (i, s"Name$i")) - .toDF("id", "other") - - /* val result = largeDataFrame.join(dataFrame1, "id") - result.explain() - result.show()*/ - - val result = largeDataFrame.join(broadcast(dataFrame1), "id") - result.explain() - result.show() - - Thread.sleep(1000000) - -} diff --git a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video__04_join_optimization/SparkApp.scala b/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video__04_join_optimization/SparkApp.scala deleted file mode 100644 index 8c3fb13..0000000 --- a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/video__04_join_optimization/SparkApp.scala +++ /dev/null @@ -1,10 +0,0 @@ -package com.codely.lesson_09_basics_spark_optimization.video__04_join_optimization - -trait SparkApp extends App { - val spark = org.apache.spark.sql.SparkSession.builder - 
.master("local[8]") - .appName("Spark Example") - //.config("spark.sql.autoBroadcastJoinThreshold", -1) - .config("spark.sql.adaptive.enabled", "false") - .getOrCreate() -} diff --git a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/z_practical_exercise/QueryPlanExercise.scala b/src/main/scala/com/codely/lesson_09_basics_spark_optimization/z_practical_exercise/QueryPlanExercise.scala deleted file mode 100644 index 4b7c566..0000000 --- a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/z_practical_exercise/QueryPlanExercise.scala +++ /dev/null @@ -1,25 +0,0 @@ -package com.codely.lesson_09_basics_spark_optimization.z_practical_exercise - -import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.functions.col - -object QueryPlanExercise extends App { - - val spark = SparkSession - .builder() - .appName("AvgSpendingCalculation") - .master("local[*]") - .getOrCreate() - - val filePath = - "src/main/scala/com/codely/lesson_09_basics_spark_optimization/z_practical_exercise/data/some_csv.csv" - - val rawData = spark.read.option("header", "true").csv(filePath) - - val filteredData = - rawData.filter(col("colA") === 1).selectExpr("upper(colB) as colB") - - filteredData.explain() - filteredData.show() - -} diff --git a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/z_practical_exercise/data/some_csv.csv b/src/main/scala/com/codely/lesson_09_basics_spark_optimization/z_practical_exercise/data/some_csv.csv deleted file mode 100644 index 1f87289..0000000 --- a/src/main/scala/com/codely/lesson_09_basics_spark_optimization/z_practical_exercise/data/some_csv.csv +++ /dev/null @@ -1,4 +0,0 @@ -colA,colB -1,"good" -1,"job" -2,"bad" \ No newline at end of file diff --git a/src/test/resources/application.conf b/src/test/resources/application.conf deleted file mode 100644 index 8f0527b..0000000 --- a/src/test/resources/application.conf +++ /dev/null @@ -1,25 +0,0 @@ - -avg-spending-app { - - spark { - appName = "AvgSpendingApp" - } - - jdbc { - url = "jdbc:postgresql://localhost:5432/test_database" - user = "admin" - password = "secret" - } - - kafka { - server = "kafka-server:9092" - topic = "kafka-topic" - } - - delta { - path = "tmp/delta" - checkpointPath = "tmp/checkpoint" - } -} - - diff --git a/src/test/resources/init_scripts.sql b/src/test/resources/init_scripts.sql deleted file mode 100644 index 7b49ab0..0000000 --- a/src/test/resources/init_scripts.sql +++ /dev/null @@ -1,13 +0,0 @@ - -CREATE TABLE example_table ( - name VARCHAR(10), - value INT -); - -INSERT INTO example_table (name, value) -VALUES - ('Alice', 10), - ('Bob', 20), - ('Alice', 20), - ('Charlie', 20), - ('Charlie', 30); \ No newline at end of file diff --git a/src/test/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/app/AvgSpendingAppTest.scala b/src/test/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/app/AvgSpendingAppTest.scala deleted file mode 100644 index 0572fde..0000000 --- a/src/test/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/app/AvgSpendingAppTest.scala +++ /dev/null @@ -1,46 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.app - -import com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.job.AvgSpendingJob -import com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.service.{DeltaTableWriter, JDBCReader} -import com.dimafeng.testcontainers.{ForAllTestContainer, PostgreSQLContainer} -import com.typesafe.config.{Config, ConfigFactory, 
ConfigValueFactory} - -class AvgSpendingAppTest extends SparkTestHelper with ForAllTestContainer { - - val reader = new JDBCReader - val writer = new DeltaTableWriter - - override val container: PostgreSQLContainer = { - PostgreSQLContainer().configure { c => - c.withInitScript("init_scripts.sql") - c.withDatabaseName("test-database") - c.withUsername("admin") - c.withPassword("secret") - } - } - - "AvgSpendingApp" should "process messages from Kafka and write results to Delta Lake" in { - - val config: Config = getTestingConfig - - AvgSpendingJob(config, reader, writer).run() - - val result = spark.read.format("delta").load(config.getString("delta.path")) - result.show() - - import testSQLImplicits._ - - val expected = Seq(("Charlie", 50), ("Bob", 20), ("Alice", 30)).toDF( - "name", - "total_spending" - ) - assert(result.collect() sameElements expected.collect()) - } - - private def getTestingConfig: Config = { - ConfigFactory - .load() - .getConfig("avg-spending-app") - .withValue("jdbc.url", ConfigValueFactory.fromAnyRef(container.jdbcUrl)) - } -} diff --git a/src/test/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/app/SparkTestHelper.scala b/src/test/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/app/SparkTestHelper.scala deleted file mode 100644 index 0703c90..0000000 --- a/src/test/scala/com/codely/lesson_08_tests_in_spark/video_01__end_to_end_testing/app/SparkTestHelper.scala +++ /dev/null @@ -1,77 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_01__end_to_end_testing.app - -import org.apache.commons.io.FileUtils -import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession} -import org.apache.spark.{SparkConf, SparkContext} -import org.scalatest.flatspec.AnyFlatSpec -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} - -import java.io.File -import java.nio.file.Files -import scala.reflect.io.Directory - -trait SparkTestHelper - extends AnyFlatSpec - with BeforeAndAfterEach - with BeforeAndAfterAll { - - private val sparkSession = SparkSession - .builder() - .master("local[*]") - .appName("test-spark-session") - .config(sparkConfiguration) - .enableHiveSupport() - .getOrCreate() - - protected var tempDir: String = _ - - protected implicit def spark: SparkSession = sparkSession - - protected def sc: SparkContext = sparkSession.sparkContext - - protected def sparkConfiguration: SparkConf = - new SparkConf() - .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .set( - "spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog" - ) - - override protected def beforeAll(): Unit = { - super.beforeAll() - clearTemporaryDirectories() - } - - override protected def beforeEach(): Unit = { - super.beforeEach() - tempDir = Files.createTempDirectory(this.getClass.toString).toString - } - - override protected def afterAll(): Unit = { - super.afterAll() - sparkSession.stop() - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - clearTemporaryDirectories() - } - - override protected def afterEach(): Unit = { - super.afterEach() - new Directory(new File(tempDir)).deleteRecursively() - spark.sharedState.cacheManager.clearCache() - spark.sessionState.catalog.reset() - } - - protected object testSQLImplicits extends SQLImplicits { - protected override def _sqlContext: SQLContext = sparkSession.sqlContext - } - - private def clearTemporaryDirectories(): Unit = { - val warehousePath = new File("spark-warehouse").getAbsolutePath - FileUtils.deleteDirectory(new 
File(warehousePath)) - - val metastoreDbPath = new File("metastore_db").getAbsolutePath - FileUtils.deleteDirectory(new File(metastoreDbPath)) - } - -} diff --git a/src/test/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/job/AvgSpendingTest.scala b/src/test/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/job/AvgSpendingTest.scala deleted file mode 100644 index 9abbd0d..0000000 --- a/src/test/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/job/AvgSpendingTest.scala +++ /dev/null @@ -1,60 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.job - -import com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.service.AvgSpending -import org.apache.spark.sql.{Row, SQLContext} -import org.apache.spark.sql.execution.streaming.MemoryStream - -class AvgSpendingTest extends SparkTestHelper { - - "AvgSpendingJob" should "calculate average spending correctly" in { - - import testSQLImplicits._ - implicit val sqlCtx: SQLContext = spark.sqlContext - - val events = MemoryStream[String] - val sessions = events.toDS - assert(sessions.isStreaming, "sessions must be a streaming Dataset") - - val transformedSessions = AvgSpending.calculate(sessions.toDF()) - - val streamingQuery = transformedSessions.writeStream - .format("memory") - .queryName("queryName") - .outputMode("complete") - .start - - val offset = events.addData(AvgSpendingTest.testPurchase) - - streamingQuery.processAllAvailable() - events.commit(offset) - - val result = spark.sql("select * from queryName") - result.show() - assert( - result.collect().head === Row("user456", "Electronics", 6, 599.98) - ) - } -} - -object AvgSpendingTest { - - val testPurchase: String = - """ - |{ - | "eventType": "purchase", - | "timestamp": "2024-06-28T14:35:00Z", - | "userId": "user456", - | "transactionId": "trans789", - | "products": [ - | { - | "productId": "prod123", - | "quantity": 2, - | "description": "Sample product description", - | "category": "Electronics", - | "price": 299.99 - | } - | ], - | "eventId": "event012" - |} - |""".stripMargin -} diff --git a/src/test/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/job/SparkTestHelper.scala b/src/test/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/job/SparkTestHelper.scala deleted file mode 100644 index 83072af..0000000 --- a/src/test/scala/com/codely/lesson_08_tests_in_spark/video_02__unit_testing/scala/job/SparkTestHelper.scala +++ /dev/null @@ -1,77 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.video_02__unit_testing.scala.job - -import org.apache.commons.io.FileUtils -import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession} -import org.apache.spark.{SparkConf, SparkContext} -import org.scalatest.flatspec.AnyFlatSpec -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} - -import java.io.File -import java.nio.file.Files -import scala.reflect.io.Directory - -trait SparkTestHelper - extends AnyFlatSpec - with BeforeAndAfterEach - with BeforeAndAfterAll { - - private val sparkSession = SparkSession - .builder() - .master("local[*]") - .appName("test-spark-session") - .config(sparkConfiguration) - .enableHiveSupport() - .getOrCreate() - - protected var tempDir: String = _ - - protected implicit def spark: SparkSession = sparkSession - - protected def sc: SparkContext = sparkSession.sparkContext - - protected def sparkConfiguration: SparkConf = - new SparkConf() - .set("spark.sql.extensions", 
"io.delta.sql.DeltaSparkSessionExtension") - .set( - "spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog" - ) - - override protected def beforeAll(): Unit = { - super.beforeAll() - clearTemporaryDirectories() - } - - override protected def beforeEach(): Unit = { - super.beforeEach() - tempDir = Files.createTempDirectory(this.getClass.toString).toString - } - - override protected def afterAll(): Unit = { - super.afterAll() - sparkSession.stop() - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - clearTemporaryDirectories() - } - - override protected def afterEach(): Unit = { - super.afterEach() - new Directory(new File(tempDir)).deleteRecursively() - spark.sharedState.cacheManager.clearCache() - spark.sessionState.catalog.reset() - } - - protected object testSQLImplicits extends SQLImplicits { - protected override def _sqlContext: SQLContext = sparkSession.sqlContext - } - - private def clearTemporaryDirectories(): Unit = { - val warehousePath = new File("spark-warehouse").getAbsolutePath - FileUtils.deleteDirectory(new File(warehousePath)) - - val metastoreDbPath = new File("metastore_db").getAbsolutePath - FileUtils.deleteDirectory(new File(metastoreDbPath)) - } - -} diff --git a/src/test/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/job/AvgSpendingJobTest.scala b/src/test/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/job/AvgSpendingJobTest.scala deleted file mode 100644 index 3615852..0000000 --- a/src/test/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/job/AvgSpendingJobTest.scala +++ /dev/null @@ -1,79 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.z_practical_exercise.job - -import com.codely.lesson_08_tests_in_spark.z_practical_exercise.service.{AvgSpendingCalculator, DeltaTableWriter, kafkaReader} -import com.typesafe.config.{Config, ConfigFactory} -import org.apache.spark.sql.{DataFrame, functions} -import org.apache.spark.sql.functions.{avg, lit} -import org.apache.spark.sql.streaming.StreamingQuery -import org.mockito.{ArgumentCaptor, ArgumentMatchers, ArgumentMatchersSugar, MockitoSugar} - -class AvgSpendingJobTest extends SparkTestHelper with MockitoSugar { - - private val mockReader = mock[kafkaReader] - private val mockWriter = mock[DeltaTableWriter] - private implicit val fakeCalculator: AvgSpendingCalculator[DataFrame] = - FakeAvgSpendingCalculator - - "AvgSpendingJob" should "calculate the average spending per user" in { - - import testSQLImplicits._ - val config = getTestingConfig - - val data = Seq( - ("user1", "2024-01-01T12:00:00Z", "Electronics", 100), - ("user1", "2024-01-01T12:00:00Z", "Electronics", 200), - ("user2", "2024-01-01T12:00:00Z", "Books", 50) - ).toDF("userId", "timestamp", "category", "price") - - when(mockReader.readFromKafka("kafka-server:9092", "kafka-topic")) - .thenReturn(data) - - val mockQuery = mock[StreamingQuery] - doNothing.when(mockQuery).awaitTermination() - - val dataFrameCaptor: ArgumentCaptor[DataFrame] = ArgumentCaptor.forClass(classOf[DataFrame]) - - when( - mockWriter.writeToDeltaTable( - dataFrameCaptor.capture(), - ArgumentMatchersSugar.any[String], - ArgumentMatchersSugar.any[String] - ) - ).thenReturn(mockQuery) - - val job = AvgSpendingJob(config, mockReader, mockWriter) - - job.run() - - val expectedData = Seq( - ("user1", "Electronics", 300), - ("user2", "Books", 50) - ).toDF("userId", "category", "AvgSpending") - - val capturedData = dataFrameCaptor.getValue - assert(expectedData.collect() sameElements 
capturedData.collect()) - - verify(mockWriter).writeToDeltaTable( - ArgumentMatchers.eq(capturedData), - org.mockito.ArgumentMatchers.eq("tmp/delta"), - org.mockito.ArgumentMatchers.eq("tmp/checkpoint") - ) - } - - private def getTestingConfig: Config = { - ConfigFactory - .load() - .getConfig("avg-spending-app") - } - -} - -object FakeAvgSpendingCalculator extends AvgSpendingCalculator[DataFrame] { - - override def calculate(dataFrame: DataFrame): DataFrame = { - dataFrame - .groupBy("userId", "category") - .agg(functions.sum("price").alias("AvgSpending")) - .select("userId", "category", "AvgSpending") - } -} diff --git a/src/test/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/job/SparkTestHelper.scala b/src/test/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/job/SparkTestHelper.scala deleted file mode 100644 index e6c5f71..0000000 --- a/src/test/scala/com/codely/lesson_08_tests_in_spark/z_practical_exercise/job/SparkTestHelper.scala +++ /dev/null @@ -1,77 +0,0 @@ -package com.codely.lesson_08_tests_in_spark.z_practical_exercise.job - -import org.apache.commons.io.FileUtils -import org.apache.spark.sql.{SQLContext, SQLImplicits, SparkSession} -import org.apache.spark.{SparkConf, SparkContext} -import org.scalatest.flatspec.AnyFlatSpec -import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} - -import java.io.File -import java.nio.file.Files -import scala.reflect.io.Directory - -trait SparkTestHelper - extends AnyFlatSpec - with BeforeAndAfterEach - with BeforeAndAfterAll { - - private val sparkSession = SparkSession - .builder() - .master("local[*]") - .appName("test-spark-session") - .config(sparkConfiguration) - .enableHiveSupport() - .getOrCreate() - - protected var tempDir: String = _ - - protected implicit def spark: SparkSession = sparkSession - - protected def sc: SparkContext = sparkSession.sparkContext - - protected def sparkConfiguration: SparkConf = - new SparkConf() - .set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") - .set( - "spark.sql.catalog.spark_catalog", - "org.apache.spark.sql.delta.catalog.DeltaCatalog" - ) - - override protected def beforeAll(): Unit = { - super.beforeAll() - clearTemporaryDirectories() - } - - override protected def beforeEach(): Unit = { - super.beforeEach() - tempDir = Files.createTempDirectory(this.getClass.toString).toString - } - - override protected def afterAll(): Unit = { - super.afterAll() - sparkSession.stop() - SparkSession.clearActiveSession() - SparkSession.clearDefaultSession() - clearTemporaryDirectories() - } - - override protected def afterEach(): Unit = { - super.afterEach() - new Directory(new File(tempDir)).deleteRecursively() - spark.sharedState.cacheManager.clearCache() - spark.sessionState.catalog.reset() - } - - protected object testSQLImplicits extends SQLImplicits { - protected override def _sqlContext: SQLContext = sparkSession.sqlContext - } - - private def clearTemporaryDirectories(): Unit = { - val warehousePath = new File("spark-warehouse").getAbsolutePath - FileUtils.deleteDirectory(new File(warehousePath)) - - val metastoreDbPath = new File("metastore_db").getAbsolutePath - FileUtils.deleteDirectory(new File(metastoreDbPath)) - } - -}
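The streaming unit test for AvgSpending removed in this diff follows a common pattern for testing structured-streaming transformations: push records through a MemoryStream, run the transformation, write to the in-memory sink, and query the result table. Below is a minimal, self-contained sketch of that pattern; the object name MemoryStreamSketch, the countByUser helper and the "result" query name are illustrative only, and it uses a plain local SparkSession rather than the Hive/Delta configuration the deleted SparkTestHelper set up.

import org.apache.spark.sql.{DataFrame, SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions.{col, get_json_object}

object MemoryStreamSketch extends App {

  // Plain local session; the deleted SparkTestHelper additionally enabled Hive support and the Delta catalog.
  implicit val spark: SparkSession = SparkSession
    .builder()
    .master("local[*]")
    .appName("memory-stream-sketch")
    .getOrCreate()

  import spark.implicits._
  implicit val sqlCtx: SQLContext = spark.sqlContext

  // Hypothetical stand-in for AvgSpending.calculate: pull userId out of the raw JSON value
  // and count events per user.
  def countByUser(df: DataFrame): DataFrame =
    df.select(get_json_object(col("value"), "$.userId").as("userId"))
      .groupBy("userId")
      .count()

  val events = MemoryStream[String]            // in-memory streaming source

  val query = countByUser(events.toDS().toDF())
    .writeStream
    .format("memory")                          // in-memory sink, queryable as a temp table
    .queryName("result")
    .outputMode("complete")
    .start()

  events.addData("""{"userId":"user456"}""")
  query.processAllAvailable()                  // block until the added data has been processed

  spark.sql("select * from result").show()     // expect a single row: user456 -> 1

  query.stop()
  spark.stop()
}

The complete output mode matters here: each micro-batch re-emits the full aggregate into the memory sink, which is what lets a test assert against a single final table after processAllAvailable() returns.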