From 0c17971d5af758859732f56ea0945a1d68eb4fa1 Mon Sep 17 00:00:00 2001
From: Arkadiusz Komarzewski
Date: Mon, 2 Dec 2019 15:02:57 +0100
Subject: [PATCH] Bug 1574490 - Add support for reading from BigQuery views to
 EventsToAmplitude job

This adds support for reading from the views introduced in
https://github.com/mozilla/bigquery-etl/pull/510.

This can be used as an alternative if S3 ingestion takes too long to
set up.
---
 .../streaming/EventsToAmplitude.scala         | 84 ++++++++++++++-----
 1 file changed, 63 insertions(+), 21 deletions(-)

diff --git a/src/main/scala/com/mozilla/telemetry/streaming/EventsToAmplitude.scala b/src/main/scala/com/mozilla/telemetry/streaming/EventsToAmplitude.scala
index b1aaf149..9c49a786 100644
--- a/src/main/scala/com/mozilla/telemetry/streaming/EventsToAmplitude.scala
+++ b/src/main/scala/com/mozilla/telemetry/streaming/EventsToAmplitude.scala
@@ -99,6 +99,14 @@ object EventsToAmplitude extends StreamingJobBase {
       descr = "In batch mode, pings will be packed into maxParallelRequests * multiplier partitions",
       required = false,
       default = Some(1))
+    val dataSource: ScallopOption[String] = opt[String](
+      descr = "Data source for batch mode: `heka` or `bigquery`",
+      required = false,
+      default = Some("heka"))
+    val bqSourceTable: ScallopOption[String] = opt[String](
+      descr = "Source table, used when dataSource == 'bigquery'",
+      required = false,
+      default = None)
 
     conflicts(kafkaBroker, List(from, to, fileLimit, minDelay, maxParallelRequests))
 
     validateOpt (sample) {
@@ -235,7 +243,6 @@ object EventsToAmplitude extends StreamingJobBase {
   }
 
   def sendBatchEvents(spark: SparkSession, opts: Opts): Unit = {
-    val config = readConfigFile(opts.configFilePath())
     val maxParallelRequests = opts.maxParallelRequests()
     val partitionMultiplier = opts.partitionMultiplier()
 
@@ -247,39 +254,74 @@ object EventsToAmplitude extends StreamingJobBase {
     val httpSinkConfig = AmplitudeHttpSink.Config.withMetrics(spark)
     val httpSink = AmplitudeHttpSink(apiKey = apiKey, url = opts.url(), httpSinkConfig)
 
+    if (opts.dataSource() == "bigquery") {
+      // Ignore dates; read from a pre-filtered table populated by an Airflow DAG.
+      val rawEvents = spark.read.format("bigquery")
+        .option("table", opts.bqSourceTable())
+        .load()
+
+      // event_properties and user_properties arrive as pre-serialized JSON
+      // strings, so to_json escapes them a second time; the regexp_replace
+      // chain strips that extra escaping so they reach Amplitude as objects.
+      val events_json = rawEvents
+        .withColumn("event_json_escaped",
+          f.to_json(f.struct(
+            f.col("device_id"), f.col("session_id"), f.col("insert_id"),
+            f.col("event_type"), f.col("time"), f.col("event_properties"),
+            f.col("user_properties"), f.col("app_version"), f.col("os_name"),
+            f.col("os_version"), f.col("country"), f.col("city")
+          )))
+        .withColumn("event_json",
+          f.regexp_replace(f.regexp_replace(
+            f.regexp_replace(f.col("event_json_escaped"), "\\\\\"", "\""),
+            "\"\\{", "{"), "\\}\"", "}"))
+        .select("device_id", "event_json")
+
+      log.info("Sending to Amplitude...")
+      import spark.implicits._
+      events_json.repartition(maxParallelRequests, f.col("device_id")) // Bug 1484819
+        .select(f.array(f.col("event_json")))
+        .as[Seq[String]]
+        .foreachPartition { it =>
+          httpSink.batchAndProcess(it)
+          java.lang.Thread.sleep(minDelay)
+        }
+      log.info("Done!")
+    } else {
+      val config = readConfigFile(opts.configFilePath())
 
-    datesBetween(opts.from(), opts.to.get).foreach { currentDate =>
-      val dataset = com.mozilla.telemetry.heka.Dataset(config.source)
-      val topLevelFields = TOP_LEVEL_PING_FIELDS(config.source)
+      datesBetween(opts.from(), opts.to.get).foreach { currentDate =>
 
-      val pings = config.getBatchFilters.filter{
-        case(name, _) => topLevelFields.contains(name)
-      }.foldLeft(dataset){
-        case(d, (key, values)) => d.where(key) {
+        val dataset = com.mozilla.telemetry.heka.Dataset(config.source)
+        val topLevelFields = TOP_LEVEL_PING_FIELDS(config.source)
+
+        val pings = config.getBatchFilters.filter {
+          case (name, _) => topLevelFields.contains(name)
+        }.foldLeft(dataset) {
+          case (d, (key, values)) => d.where(key) {
           case v if values.contains(v) => true
         }
       }.where("submissionDate") {
         case date if date == currentDate => true
      }.records(opts.fileLimit.get, Some(maxParallelRequests * partitionMultiplier))
-        .map(m => Row(m.toByteArray))
+          .map(m => Row(m.toByteArray))
 
-      val schema = StructType(List(
+        val schema = StructType(List(
         StructField("value", BinaryType, true)
-      ))
+        ))
 
-      val pingsDataFrame = spark.createDataFrame(pings, schema)
+        val pingsDataFrame = spark.createDataFrame(pings, schema)
 
-      log.info(s"Processing events for ${pingsDataFrame.count()} pings on $currentDate")
+        log.info(s"Processing events for ${pingsDataFrame.count()} pings on $currentDate")
 
-      import spark.implicits._
+        import spark.implicits._
 
-      getEvents(config, pingsDataFrame, opts.sample(), opts.raiseOnError())
-        .repartition(maxParallelRequests, f.col("clientId")) // Bug 1484819
-        .map(_.events)
-        .foreachPartition { it =>
-          httpSink.batchAndProcess(it)
-          java.lang.Thread.sleep(minDelay)
-        }
+        getEvents(config, pingsDataFrame, opts.sample(), opts.raiseOnError())
+          .repartition(maxParallelRequests, f.col("clientId")) // Bug 1484819
+          .map(_.events)
+          .foreachPartition { it =>
+            httpSink.batchAndProcess(it)
+            java.lang.Thread.sleep(minDelay)
+          }
+      }
     }
 
     spark.stop()
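
Note (reviewer addition, not part of the patch): the three chained
regexp_replace calls exist because event_properties and user_properties arrive
from the BigQuery view as pre-serialized JSON strings, so to_json escapes them
a second time. Below is a minimal, self-contained sketch of that un-escaping
step; the toy DataFrame and the UnescapeSketch object name are illustrative,
not part of the job.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.{functions => f}

object UnescapeSketch extends App {
  val spark = SparkSession.builder.master("local[*]").appName("unescape-sketch").getOrCreate()
  import spark.implicits._

  // event_properties is already a JSON *string*, as in the BigQuery view.
  val df = Seq(("client-1", """{"branch":"control"}""")).toDF("device_id", "event_properties")

  // to_json double-escapes the embedded JSON string:
  // {"device_id":"client-1","event_properties":"{\"branch\":\"control\"}"}
  val escaped = df.withColumn("event_json_escaped",
    f.to_json(f.struct(f.col("device_id"), f.col("event_properties"))))

  // The same replace chain as the patch turns it back into nested JSON:
  // {"device_id":"client-1","event_properties":{"branch":"control"}}
  val unescaped = escaped.withColumn("event_json",
    f.regexp_replace(f.regexp_replace(
      f.regexp_replace(f.col("event_json_escaped"), "\\\\\"", "\""),
      "\"\\{", "{"), "\\}\"", "}"))

  unescaped.select("event_json").show(truncate = false)
  spark.stop()
}

One caveat worth noting: the replace chain also rewrites escaped quotes inside
property values, so it assumes payloads without embedded \" sequences.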
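Note (reviewer addition, an assumption rather than part of the patch):
bqSourceTable defaults to None, so selecting the bigquery data source without
a table only fails once the option is dereferenced inside sendBatchEvents. A
parse-time guard in the same style as the existing validateOpt(sample) block
could fail fast instead; a sketch, assuming Scallop's multi-option validateOpt
overload:

  validateOpt(dataSource, bqSourceTable) {
    // Reject the combination before the Spark job starts.
    case (Some("bigquery"), None) =>
      Left("bqSourceTable is required when dataSource is 'bigquery'")
    case _ => Right(())
  }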