forked from soumilshah1995/DebeziumFlinkHudiSync
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSubmit Spark Job
49 lines (38 loc) · 2.51 KB
/
Submit Spark Job
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# ---- Hudi Streamer: continuously ingest Avro messages from Kafka ----
# Spark options must precede the application jar; HoodieStreamer options follow it.

jar_dir=/Users/soumilshah/IdeaProjects/SparkProject/DeltaStreamer/jar

# Spark-level options (driver/executor setup, bundles, extra jars).
spark_opts=(
  --class org.apache.hudi.utilities.streamer.HoodieStreamer
  --packages 'org.apache.hudi:hudi-spark3.4-bundle_2.12:0.14.0,org.apache.hadoop:hadoop-aws:3.3.2'
  --properties-file spark-config.properties
  --master 'local[*]'
  --executor-memory 1g
  --jars "${jar_dir}/hudi-extensions-0.1.0-SNAPSHOT-bundled.jar,${jar_dir}/hudi-java-client-0.14.0.jar"
)

# HoodieStreamer application options: upsert Avro Kafka records into a
# COPY_ON_WRITE table at the S3 base path, syncing the table to Hive (HMS).
streamer_opts=(
  --table-type COPY_ON_WRITE
  --target-base-path 's3a://warehouse/database=default/table_name=processed_orders'
  --target-table processed_orders
  --op UPSERT
  --enable-hive-sync
  --source-limit 4000000
  --source-class org.apache.hudi.utilities.sources.AvroKafkaSource
  --schemaprovider-class org.apache.hudi.utilities.schema.SchemaRegistryProvider
  --min-sync-interval-seconds 60
  --continuous
  --source-ordering-field order_number
  # Kafka + Schema Registry connectivity; value schema resolved from the registry.
  --hoodie-conf bootstrap.servers=localhost:7092
  --hoodie-conf schema.registry.url=http://localhost:8081
  --hoodie-conf hoodie.deltastreamer.schemaprovider.registry.url=http://localhost:8081/subjects/processed_orders-value/versions/latest
  --hoodie-conf hoodie.deltastreamer.source.kafka.value.deserializer.class=io.confluent.kafka.serializers.KafkaAvroDeserializer
  # Intentionally empty: no schema converter is applied.
  --hoodie-conf hoodie.deltastreamer.schemaprovider.registry.schemaconverter=
  --hoodie-conf hoodie.deltastreamer.source.kafka.topic=processed_orders
  --hoodie-conf auto.offset.reset=earliest
  # Write keys: record key and precombine field are both order_number;
  # partition path intentionally empty (non-partitioned table).
  --hoodie-conf hoodie.datasource.write.recordkey.field=order_number
  --hoodie-conf hoodie.datasource.write.partitionpath.field=
  --hoodie-conf hoodie.datasource.write.precombine.field=order_number
  # Hive sync via the Hive Metastore thrift endpoint.
  --hoodie-conf hoodie.datasource.hive_sync.partition_extractor_class=org.apache.hudi.hive.MultiPartKeysValueExtractor
  --hoodie-conf hoodie.datasource.hive_sync.metastore.uris=thrift://localhost:9083
  --hoodie-conf hoodie.datasource.hive_sync.mode=hms
  --hoodie-conf hoodie.datasource.hive_sync.enable=true
  --hoodie-conf hoodie.datasource.hive_sync.database=default
  --hoodie-conf hoodie.datasource.hive_sync.table=processed_orders
  --hoodie-conf hoodie.datasource.write.hive_style_partitioning=true
)

spark-submit \
  "${spark_opts[@]}" \
  "${jar_dir}/hudi-utilities-slim-bundle_2.12-0.14.0.jar" \
  "${streamer_opts[@]}"
# =============================================