From 2398eea91bb27a93aec1d4b0eb225359800c989c Mon Sep 17 00:00:00 2001 From: Bhavani Sudha Saktheeswaran <2179254+bhasudha@users.noreply.github.com> Date: Fri, 21 Jul 2023 07:11:44 -0700 Subject: [PATCH 1/3] [HUDI-6112] Fix bugs in Doc Generation tool - Add Config Param in Description - Styling changes to fix table size and toc on side for better navigation - Bug fix in basic configs page to merge spark datasource related read and write configs --- .../hudi/utils/HoodieConfigDocGenerator.java | 87 +- website/docs/basic_configurations.md | 457 ++-- website/docs/configurations.md | 2242 +++++++---------- website/src/css/custom.css | 17 +- 4 files changed, 1102 insertions(+), 1701 deletions(-) diff --git a/hudi-utils/src/main/java/org/apache/hudi/utils/HoodieConfigDocGenerator.java b/hudi-utils/src/main/java/org/apache/hudi/utils/HoodieConfigDocGenerator.java index fc34ed17abff..b27103f6cd79 100644 --- a/hudi-utils/src/main/java/org/apache/hudi/utils/HoodieConfigDocGenerator.java +++ b/hudi-utils/src/main/java/org/apache/hudi/utils/HoodieConfigDocGenerator.java @@ -19,6 +19,7 @@ package org.apache.hudi.utils; +import com.sun.tools.doclets.internal.toolkit.NestedClassWriter; import org.apache.hudi.common.config.*; import org.apache.hudi.common.config.ConfigGroups.Names; import org.apache.hudi.common.config.ConfigGroups.SubGroupNames; @@ -79,7 +80,7 @@ public class HoodieConfigDocGenerator { private static final String DEFAULT_FOOTER_MARKUP = new StringBuilder().append(NEWLINE).append(new HorizontalRule(3)).append(DOUBLE_NEWLINE).toString(); private static final Integer DEFAULT_CONFIG_GROUP_HEADING_LEVEL = 2; private static final Integer DEFAULT_CONFIG_PARAM_HEADING_LEVEL = 3; - private static final TableRow DEFAULT_TABLE_HEADER_ROW = new TableRow<>(new ArrayList<>(Arrays.asList("Config Name", "Default", "Description", "Since Version"))); + private static final TableRow DEFAULT_TABLE_HEADER_ROW = new TableRow<>(new ArrayList<>(Arrays.asList("Config Name", "Default", "Description"))); public static void main(String[] args) { Reflections reflections = new Reflections("org.apache.hudi"); @@ -121,13 +122,9 @@ private static void generateAllConfigsHeader(StringBuilder builder) { keywords: [configurations, default, flink options, spark, configs, parameters] permalink: /docs/configurations.html summary: This section offers an overview of tools available to operate an ecosystem of Hudi + toc_min_heading_level: 2 + toc_max_heading_level: 4 last_modified_at: 2019-12-30T15:59:57-04:00 - hide_table_of_contents: true - --- - import TOCInline from '@theme/TOCInline'; - - - --- */ LocalDateTime now = LocalDateTime.now(); @@ -136,14 +133,10 @@ private static void generateAllConfigsHeader(StringBuilder builder) { .append("keywords: [ configurations, default, flink options, spark, configs, parameters ] ").append(NEWLINE) .append("permalink: /docs/configurations.html").append(NEWLINE) .append("summary: " + ALL_CONFIGS_PAGE_SUMMARY).append(NEWLINE) + .append("toc_min_heading_level: 2").append(NEWLINE) + .append("toc_max_heading_level: 4").append(NEWLINE) .append("last_modified_at: " + DateTimeFormatter.ISO_DATE_TIME.format(now)).append(NEWLINE) - .append("hide_table_of_contents: true").append(NEWLINE) .append(new HorizontalRule()).append(NEWLINE) - .append("import TOCInline from '@theme/TOCInline';") - .append(DOUBLE_NEWLINE) - .append("") - .append(DOUBLE_NEWLINE) - .append(new HorizontalRule()) .append(DOUBLE_NEWLINE); // Description builder.append(ALL_CONFIGS_PAGE_SUMMARY).append(DOUBLE_NEWLINE); @@ -160,12 
+153,6 @@ private static void generateBasicConfigsHeader(StringBuilder builder) { features a subset of the most frequently used configurations. For a full list of all configs, please visit the [All Configurations](/docs/configurations) page. last_modified_at: 2019-12-30T15:59:57-04:00 - hide_table_of_contents: true - --- - import TOCInline from '@theme/TOCInline'; - - - --- */ LocalDateTime now = LocalDateTime.now(); @@ -173,13 +160,7 @@ private static void generateBasicConfigsHeader(StringBuilder builder) { .append("title: ").append("Basic Configurations").append(NEWLINE) .append("summary: " + BASIC_CONFIGS_PAGE_SUMMARY).append(NEWLINE) .append("last_modified_at: " + DateTimeFormatter.ISO_DATE_TIME.format(now)).append(NEWLINE) - .append("hide_table_of_contents: true").append(NEWLINE) .append(new HorizontalRule()).append(NEWLINE) - .append("import TOCInline from '@theme/TOCInline';") - .append(DOUBLE_NEWLINE) - .append("") - .append(DOUBLE_NEWLINE) - .append(new HorizontalRule()) .append(DOUBLE_NEWLINE); // Description builder.append(BASIC_CONFIGS_PAGE_SUMMARY).append(DOUBLE_NEWLINE); @@ -214,20 +195,20 @@ private static ConfigTableRow generateConfigTableRow(Class subType, Field field, boolean isConfigRequired = (defaultValue == null); // Description + String configParam = "`Config Param: " + field.getName() + "`"; String description = StringUtils.isNullOrEmpty(cfgProperty.doc()) ? "" : cfgProperty.doc().replaceAll("[\\t\\n\\r]+", " ").replaceAll("&", "&").replaceAll("\\|", " | ").replaceAll("<", "<").replaceAll(">", ">"); - columns.add(description); // First version + String versionInfo = ""; if (cfgProperty.getSinceVersion().isPresent()) { - String sinceVersion = String.valueOf(cfgProperty.getSinceVersion().get()); + String sinceVersion = "
`Since Version: " + cfgProperty.getSinceVersion().get() + "`"; String deprecatedVersion = ""; if (cfgProperty.getDeprecatedVersion().isPresent()) { - deprecatedVersion = ". Deprecated since: " + String.valueOf(cfgProperty.getDeprecatedVersion().get()); + deprecatedVersion = "
`Deprecated since: " + cfgProperty.getDeprecatedVersion().get() + "`"; } - columns.add(sinceVersion + deprecatedVersion); - } else { - columns.add(" "); + versionInfo = sinceVersion + deprecatedVersion; } + columns.add(description + "

" + configParam + versionInfo); return new ConfigTableRow(cfgProperty.key(), new TableRow<>(columns), isConfigRequired, cfgProperty.isAdvanced()); } catch (IllegalAccessException e) { @@ -465,10 +446,9 @@ private static void populateFlinkConfigs(Class subType, ConfigClassMarkups confi boolean isConfigRequired = (defaultValue == null); // Description - columns.add(StringUtils.isNullOrEmpty(description) ? "" : description.replaceAll("[\\t\\n\\r]+", " ")); - - // Since Version. this is empty since for Flink we dont have this info. - columns.add(" "); + String configParam = " `Config Param: " + field.getName() + "`"; + String desc = StringUtils.isNullOrEmpty(description) ? "" : description.replaceAll("[\\t\\n\\r]+", " "); + columns.add(desc + "

" + configParam); ConfigTableRow configRow = new ConfigTableRow(cfgProperty.key(), new TableRow<>(columns), isConfigRequired); @@ -578,33 +558,28 @@ private static void generateBasicConfigurationPages(NavigableMap inclusionList = EnumSet.noneOf(Names.class); // Iterate the Treemap and get all config groups and classes that have basic configs. Set keySet = configClassTreeMap.keySet(); - List basicConfigMarkups = new ArrayList<>(); + + Names prevGroupName = Names.ENVIRONMENT_CONFIG; + SubGroupNames prevSubGroupName = NONE; + StringBuilder stringBuilder = new StringBuilder(); for (ConfigClassMeta configClassMetaInfo : keySet) { ConfigClassMarkups configClassMarkup = configClassTreeMap.get(configClassMetaInfo); - if (configClassMarkup.basicConfigs != null) { - inclusionList.add(configClassMetaInfo.groupName); - basicConfigMarkups.add(configClassMarkup); + if (configClassMarkup.basicConfigs == null) { + continue; } - } - - String prevGroupSummary = null; - String prevSubGroupSummary = null; - StringBuilder stringBuilder = new StringBuilder(); - for(ConfigClassMarkups configClassMarkups: basicConfigMarkups) { - String currentGroupSummary = configClassMarkups.topLevelGroupSummary; - if (currentGroupSummary != prevGroupSummary) { - stringBuilder.append(NEWLINE).append(currentGroupSummary); - prevGroupSummary = currentGroupSummary; + inclusionList.add(configClassMetaInfo.groupName); + if(configClassMetaInfo.groupName != prevGroupName){ + stringBuilder.append(configClassMarkup.topLevelGroupSummary); + prevGroupName = configClassMetaInfo.groupName; } - String currentSubGroupSummary = configClassMarkups.topLevelSubGroupSummary; - if (currentSubGroupSummary != prevSubGroupSummary) { - if (currentSubGroupSummary != null) { - stringBuilder.append(NEWLINE).append(currentSubGroupSummary); + if(configClassMetaInfo.subGroupName != prevSubGroupName){ + if (configClassMetaInfo.subGroupName != NONE && configClassMarkup.topLevelSubGroupSummary != null){ + stringBuilder.append(NEWLINE).append(configClassMarkup.topLevelSubGroupSummary); } - prevSubGroupSummary = currentSubGroupSummary; + prevSubGroupName = configClassMetaInfo.subGroupName; } - stringBuilder.append(NEWLINE).append(configClassMarkups.configClassSummary); - stringBuilder.append(NEWLINE).append(configClassMarkups.basicConfigs); + stringBuilder.append(NEWLINE).append(configClassMarkup.configClassSummary); + stringBuilder.append(NEWLINE).append(configClassMarkup.basicConfigs); stringBuilder.append(DEFAULT_FOOTER_MARKUP); } generateMainHeadings(contentTableBuilder, EnumSet.complementOf(inclusionList)); diff --git a/website/docs/basic_configurations.md b/website/docs/basic_configurations.md index f329523e055d..8aa82bbd2567 100644 --- a/website/docs/basic_configurations.md +++ b/website/docs/basic_configurations.md @@ -1,25 +1,19 @@ --- title: Basic Configurations summary: This page covers the basic configurations you may use to write/read Hudi tables. This page only features a subset of the most frequently used configurations. For a full list of all configs, please visit the [All Configurations](/docs/configurations) page. -last_modified_at: 2023-07-07T17:00:30.473 -hide_table_of_contents: true +last_modified_at: 2023-07-21T07:02:09.459 --- -import TOCInline from '@theme/TOCInline'; - - ---- This page covers the basic configurations you may use to write/read Hudi tables. This page only features a subset of the most frequently used configurations. For a full list of all configs, please visit the [All Configurations](/docs/configurations) page. 
-- [**Spark Datasource Configs**](#SPARK_DATASOURCE): These configs control the Hudi Spark Datasource, providing ability to define keys/partitioning, specify the write operation, specify how to merge records or choosing query type to read. +- [**Spark Datasource Configs**](#SPARK_DATASOURCE): These configs control the Hudi Spark Datasource, providing ability to define keys/partitioning, pick out the write operation, specify how to merge records or choosing query type to read. - [**Flink Sql Configs**](#FLINK_SQL): These configs control the Hudi Flink SQL source/sink connectors, providing ability to define record keys, pick out the write operation, specify how to merge records, enable/disable asynchronous compaction or choosing query type to read. - [**Write Client Configs**](#WRITE_CLIENT): Internally, the Hudi datasource uses a RDD based HoodieWriteClient API to actually perform writes to storage. These configs provide deep control over lower level aspects like file sizing, compression, parallelism, compaction, write schema, cleaning etc. Although Hudi provides sane defaults, from time-time these configs may need to be tweaked to optimize for specific workloads. - [**Metastore and Catalog Sync Configs**](#META_SYNC): Configurations used by the Hudi to sync metadata to external metastores and catalogs. -- [**Metrics Configs**](#METRICS): These set of configs are used to enable monitoring and reporting of key Hudi stats and metrics. +- [**Metrics Configs**](#METRICS): These set of configs are used to enable monitoring and reporting of keyHudi stats and metrics. - [**Kafka Connect Configs**](#KAFKA_CONNECT): These set of configs are used for Kafka Connect Sink Connector for writing Hudi Tables -- [**DeltaStreamer Configs**](#DELTA_STREAMER): These set of configs are used for DeltaStreamer utility which provides the way to ingest from different sources such as DFS or Kafka. - +- [**Hudi Streamer Configs**](#HUDI_STREAMER): These set of configs are used for Hudi Streamer utility which provides the way to ingest from different sources such as DFS or Kafka. ## Spark Datasource Configs {#SPARK_DATASOURCE} These configs control the Hudi Spark Datasource, providing ability to define keys/partitioning, pick out the write operation, specify how to merge records or choosing query type to read. @@ -35,19 +29,15 @@ Options useful for reading tables via `read.format.option(...)` [**Basic Configs**](#Read-Options-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.read.begin.instanttime](#hoodiedatasourcereadbegininstanttime) | N/A **(Required)** | Required when `hoodie.datasource.query.type` is set to `incremental`. Represents the instant time to start incrementally pulling data from. The instanttime here need not necessarily correspond to an instant on the timeline. 
New data written with an instant_time > BEGIN_INSTANTTIME are fetched out. For e.g: ‘20170901080000’ will get all new data written after Sep 1, 2017 08:00AM. Note that if `hoodie.datasource.read.handle.hollow.commit` set to USE_STATE_TRANSITION_TIME, will use instant's `stateTransitionTime` to perform comparison. | | -| [hoodie.datasource.read.end.instanttime](#hoodiedatasourcereadendinstanttime) | N/A **(Required)** | Used when `hoodie.datasource.query.type` is set to `incremental`. Represents the instant time to limit incrementally fetched data to. When not specified latest commit time from timeline is assumed by default. When specified, new data written with an instant_time <= END_INSTANTTIME are fetched out. Point in time type queries makes more sense with begin and end instant times specified. Note that if `hoodie.datasource.read.handle.hollow.commit` set to `USE_STATE_TRANSITION_TIME`, will use instant's `stateTransitionTime` to perform comparison. | | -| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot (Optional) | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files) | | -| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) | | +| Config Name | Default | Description | +| --------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.read.begin.instanttime](#hoodiedatasourcereadbegininstanttime) | N/A **(Required)** | Required when `hoodie.datasource.query.type` is set to `incremental`. Represents the instant time to start incrementally pulling data from. The instanttime here need not necessarily correspond to an instant on the timeline. New data written with an instant_time > BEGIN_INSTANTTIME are fetched out. For e.g: ‘20170901080000’ will get all new data written after Sep 1, 2017 08:00AM. Note that if `hoodie.datasource.read.handle.hollow.commit` set to USE_STATE_TRANSITION_TIME, will use instant's `stateTransitionTime` to perform comparison.

`Config Param: BEGIN_INSTANTTIME` | +| [hoodie.datasource.read.end.instanttime](#hoodiedatasourcereadendinstanttime) | N/A **(Required)** | Used when `hoodie.datasource.query.type` is set to `incremental`. Represents the instant time to limit incrementally fetched data to. When not specified latest commit time from timeline is assumed by default. When specified, new data written with an instant_time <= END_INSTANTTIME are fetched out. Point in time type queries make more sense with begin and end instant times specified. Note that if `hoodie.datasource.read.handle.hollow.commit` set to `USE_STATE_TRANSITION_TIME`, will use instant's `stateTransitionTime` to perform comparison.

`Config Param: END_INSTANTTIME` | +| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot (Optional) | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files)

`Config Param: QUERY_TYPE` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)

`Config Param: READ_PRE_COMBINE_FIELD` | --- -## Spark Datasource Configs {#SPARK_DATASOURCE} -These configs control the Hudi Spark Datasource, providing ability to define keys/partitioning, pick out the write operation, specify how to merge records or choosing query type to read. - - ### Write Options {#Write-Options} You can pass down any of the WriteClient level configs directly using `options()` or `option(k,v)` methods. @@ -72,26 +62,27 @@ Options useful for writing tables via `write.format.option(...)` [**Basic Configs**](#Write-Options-basic-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------------------------------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql. | | -| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | N/A **(Required)** | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString() | | -| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | N/A **(Required)** | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c` | | -| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false (Optional) | Enable running of clustering service, asynchronously as inserts happen on the table. | 0.7.0 | -| [hoodie.clustering.inline](#hoodieclusteringinline) | false (Optional) | Turn on inline clustering - clustering will be run after each write operation is complete | 0.7.0 | -| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore. | | -| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url | | -| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url | | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | -| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values) | | -| [hoodie.datasource.write.operation](#hoodiedatasourcewriteoperation) | upsert (Optional) | Whether to do upsert, insert or bulk_insert for the write operation. Use bulk_insert to load new data into a table, and there on use upsert/insert. 
bulk insert uses a disk based write path to scale to load large inputs without need to cache it. | | -| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) | | -| [hoodie.datasource.write.streaming.disable.compaction](#hoodiedatasourcewritestreamingdisablecompaction) | false (Optional) | By default for MOR table, async compaction is enabled with spark streaming sink. By setting this config to true, we can disable it and the expectation is that, users will schedule and execute compaction in a different process/job altogether. Some users may wish to run it separately to manage resources across table services and regular ingestion pipeline and so this could be preferred on such cases. | 0.14.0 | -| [hoodie.datasource.write.table.type](#hoodiedatasourcewritetabletype) | COPY_ON_WRITE (Optional) | The table type for the underlying data, for this write. This can’t change between writes. | | -| [hoodie.sql.insert.mode](#hoodiesqlinsertmode) | upsert (Optional) | Insert mode when insert data to pk-table. The optional modes are: upsert, strict and non-strict.For upsert mode, insert statement do the upsert operation for the pk-table which will update the duplicate record.For strict mode, insert statement will keep the primary key uniqueness constraint which do not allow duplicate record.While for non-strict mode, hudi just do the insert operation for the pk-table. | | +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.

`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | N/A **(Required)** | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString()

`Config Param: PARTITIONPATH_FIELD` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | N/A **(Required)** | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`

`Config Param: RECORDKEY_FIELD` | +| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false (Optional) | Enable running of clustering service, asynchronously as inserts happen on the table.

`Config Param: ASYNC_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.clustering.inline](#hoodieclusteringinline) | false (Optional) | Turn on inline clustering - clustering will be run after each write operation is complete

`Config Param: INLINE_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore.

`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url

`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url

`Config Param: METASTORE_URIS` | +| [hoodie.datasource.insert.dup.policy](#hoodiedatasourceinsertduppolicy) | none (Optional) | When operation type is set to "insert", users can optionally enforce a dedup policy. This policy will be employed when records being ingested already exist in storage. Default policy is none and no action will be taken. Another option is to choose "drop", on which matching records from incoming will be dropped and the rest will be ingested. The third option is "fail", which will fail the write operation when the same records are re-ingested. In other words, a given record as deduced by the key generation policy can be ingested only once to the target table of interest.

`Config Param: INSERT_DUP_POLICY` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.

`Config Param: META_SYNC_ENABLED` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)

`Config Param: HIVE_STYLE_PARTITIONING` | +| [hoodie.datasource.write.operation](#hoodiedatasourcewriteoperation) | upsert (Optional) | Whether to do upsert, insert or bulk_insert for the write operation. Use bulk_insert to load new data into a table, and there on use upsert/insert. bulk insert uses a disk based write path to scale to load large inputs without need to cache it.

`Config Param: OPERATION` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)

`Config Param: PRECOMBINE_FIELD` | +| [hoodie.datasource.write.streaming.disable.compaction](#hoodiedatasourcewritestreamingdisablecompaction) | false (Optional) | By default for MOR table, async compaction is enabled with spark streaming sink. By setting this config to true, we can disable it and the expectation is that, users will schedule and execute compaction in a different process/job altogether. Some users may wish to run it separately to manage resources across table services and regular ingestion pipeline and so this could be preferred on such cases.

`Config Param: STREAMING_DISABLE_COMPACTION`
`Since Version: 0.14.0` | +| [hoodie.datasource.write.table.type](#hoodiedatasourcewritetabletype) | COPY_ON_WRITE (Optional) | The table type for the underlying data, for this write. This can’t change between writes.

`Config Param: TABLE_TYPE` | +| [hoodie.sql.insert.mode](#hoodiesqlinsertmode) | upsert (Optional) | Insert mode when inserting data into a pk-table. The optional modes are: upsert, strict and non-strict. For upsert mode, the insert statement does the upsert operation for the pk-table, which will update the duplicate record. For strict mode, the insert statement keeps the primary key uniqueness constraint, which does not allow duplicate records. For non-strict mode, hudi just does the insert operation for the pk-table. This config is deprecated as of 0.14.0. Please use hoodie.sql.write.operation and hoodie.datasource.insert.dup.policy as you see fit.

`Config Param: SQL_INSERT_MODE` | +| [hoodie.sql.write.operation](#hoodiesqlwriteoperation) | insert (Optional) | Sql write operation to use with INSERT_INTO spark sql command. This comes with 3 possible values, bulk_insert, insert and upsert. bulk_insert is generally meant for initial loads and is known to be performant compared to insert. But bulk_insert may not do small file management. If you prefer hudi to automatically manage small files, then you can go with "insert". There is no precombine (if there are duplicates within the same batch being ingested, the same dups will be ingested) with bulk_insert and insert, and there is no index lookup either. If you use INSERT_INTO for a mutable dataset, then you may have to set this config value to "upsert". With upsert, both precombine and updates to existing records on storage are honored. If not, you may see duplicates.

`Config Param: SQL_WRITE_OPERATION` | --- - ## Flink Sql Configs {#FLINK_SQL} These configs control the Hudi Flink SQL source/sink connectors, providing ability to define record keys, pick out the write operation, specify how to merge records, enable/disable asynchronous compaction or choosing query type to read. @@ -105,44 +96,43 @@ Flink jobs using the SQL can be configured through the options in WITH clause. T [**Basic Configs**](#Flink-Options-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------ | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.database.name](#hoodiedatabasename) | N/A **(Required)** | Database name to register to Hive metastore | | -| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name to register to Hive metastore | | -| [path](#path) | N/A **(Required)** | Base path for the target hoodie table. The path would be created if it does not exist, otherwise a Hoodie table expects to be initialized successfully | | -| [read.end-commit](#readend-commit) | N/A **(Required)** | End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss' | | -| [read.start-commit](#readstart-commit) | N/A **(Required)** | Start commit instant for reading, the commit time format should be 'yyyyMMddHHmmss', by default reading from the latest instant for streaming read | | -| [archive.max_commits](#archivemax_commits) | 50 (Optional) | Max number of commits to keep before archiving older commits into a sequential log, default 50 | | -| [archive.min_commits](#archivemin_commits) | 40 (Optional) | Min number of commits to keep before archiving older commits into a sequential log, default 40 | | -| [cdc.enabled](#cdcenabled) | false (Optional) | When enable, persist the change data if necessary, and can be queried as a CDC query mode | | -| [cdc.supplemental.logging.mode](#cdcsupplementalloggingmode) | DATA_BEFORE_AFTER (Optional) | Setting 'op_key_only' persists the 'op' and the record key only, setting 'data_before' persists the additional 'before' image, and setting 'data_before_after' persists the additional 'before' and 'after' images. | | -| [changelog.enabled](#changelogenabled) | false (Optional) | Whether to keep all the intermediate changes, we try to keep all the changes of a record when enabled: 1). The sink accept the UPDATE_BEFORE message; 2). The source try to emit every changes of a record. The semantics is best effort because the compaction job would finally merge all changes of a record into one. default false to have UPSERT semantics | | -| [clean.async.enabled](#cleanasyncenabled) | true (Optional) | Whether to cleanup the old commits immediately on new commits, enabled by default | | -| [clean.retain_commits](#cleanretain_commits) | 30 (Optional) | Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled). 
This also directly translates into how much you can incrementally pull on this table, default 30 | | -| [clustering.async.enabled](#clusteringasyncenabled) | false (Optional) | Async Clustering, default false | | -| [clustering.plan.strategy.small.file.limit](#clusteringplanstrategysmallfilelimit) | 600 (Optional) | Files smaller than the size specified here are candidates for clustering, default 600 MB | | -| [clustering.plan.strategy.target.file.max.bytes](#clusteringplanstrategytargetfilemaxbytes) | 1073741824 (Optional) | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups, default 1 GB | | -| [compaction.async.enabled](#compactionasyncenabled) | true (Optional) | Async Compaction, enabled by default for MOR | | -| [compaction.delta_commits](#compactiondelta_commits) | 5 (Optional) | Max delta commits needed to trigger compaction, default 5 commits | | -| [hive_sync.enabled](#hive_syncenabled) | false (Optional) | Asynchronously sync Hive meta to HMS, default false | | -| [hive_sync.jdbc_url](#hive_syncjdbc_url) | jdbc:hive2://localhost:10000 (Optional) | Jdbc URL for hive sync, default 'jdbc:hive2://localhost:10000' | | -| [hive_sync.metastore.uris](#hive_syncmetastoreuris) | (Optional) | Metastore uris for hive sync, default '' | | -| [hive_sync.mode](#hive_syncmode) | HMS (Optional) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql, default 'hms' | | -| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot (Optional) | Decides how data files need to be read, in 1) Snapshot mode (obtain latest view, based on row & columnar data); 2) incremental mode (new data since an instantTime); 3) Read Optimized mode (obtain latest view, based on columnar data) .Default: snapshot | | -| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values) | | -| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | (Optional) | Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`. Actual value obtained by invoking .toString(), default '' | | -| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | uuid (Optional) | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c` | | -| [index.type](#indextype) | FLINK_STATE (Optional) | Index type of Flink write job, default is using state backed index. | | -| [metadata.compaction.delta_commits](#metadatacompactiondelta_commits) | 10 (Optional) | Max delta commits for metadata table to trigger compaction, default 10 | | -| [metadata.enabled](#metadataenabled) | true (Optional) | Enable the internal metadata table which serves table metadata like level file listings, default enabled | | -| [precombine.field](#precombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) 
| | -| [read.streaming.enabled](#readstreamingenabled) | false (Optional) | Whether to read as streaming source, default false | | -| [table.type](#tabletype) | COPY_ON_WRITE (Optional) | Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ | | -| [write.operation](#writeoperation) | upsert (Optional) | The write operation, that this write should do | | -| [write.parquet.max.file.size](#writeparquetmaxfilesize) | 120 (Optional) | Target size for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | --------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.database.name](#hoodiedatabasename) | N/A **(Required)** | Database name to register to Hive metastore

`Config Param: DATABASE_NAME` | +| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name to register to Hive metastore

`Config Param: TABLE_NAME` | +| [path](#path) | N/A **(Required)** | Base path for the target hoodie table. The path would be created if it does not exist, otherwise a Hoodie table expects to be initialized successfully

`Config Param: PATH` | +| [read.end-commit](#readend-commit) | N/A **(Required)** | End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss'

`Config Param: READ_END_COMMIT` | +| [read.start-commit](#readstart-commit) | N/A **(Required)** | Start commit instant for reading, the commit time format should be 'yyyyMMddHHmmss', by default reading from the latest instant for streaming read

`Config Param: READ_START_COMMIT` | +| [archive.max_commits](#archivemax_commits) | 50 (Optional) | Max number of commits to keep before archiving older commits into a sequential log, default 50

`Config Param: ARCHIVE_MAX_COMMITS` | +| [archive.min_commits](#archivemin_commits) | 40 (Optional) | Min number of commits to keep before archiving older commits into a sequential log, default 40

`Config Param: ARCHIVE_MIN_COMMITS` | +| [cdc.enabled](#cdcenabled) | false (Optional) | When enabled, persist the change data if necessary, and it can be queried in CDC query mode

`Config Param: CDC_ENABLED` | +| [cdc.supplemental.logging.mode](#cdcsupplementalloggingmode) | DATA_BEFORE_AFTER (Optional) | Setting 'op_key_only' persists the 'op' and the record key only, setting 'data_before' persists the additional 'before' image, and setting 'data_before_after' persists the additional 'before' and 'after' images.

`Config Param: SUPPLEMENTAL_LOGGING_MODE` | +| [changelog.enabled](#changelogenabled) | false (Optional) | Whether to keep all the intermediate changes. When enabled, we try to keep all the changes of a record: 1). The sink accepts the UPDATE_BEFORE message; 2). The source tries to emit every change of a record. The semantics are best effort because the compaction job would finally merge all changes of a record into one. Default false to have UPSERT semantics

`Config Param: CHANGELOG_ENABLED` | +| [clean.async.enabled](#cleanasyncenabled) | true (Optional) | Whether to cleanup the old commits immediately on new commits, enabled by default

`Config Param: CLEAN_ASYNC_ENABLED` | +| [clean.retain_commits](#cleanretain_commits) | 30 (Optional) | Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much you can incrementally pull on this table, default 30

`Config Param: CLEAN_RETAIN_COMMITS` | +| [clustering.async.enabled](#clusteringasyncenabled) | false (Optional) | Async Clustering, default false

`Config Param: CLUSTERING_ASYNC_ENABLED` | +| [clustering.plan.strategy.small.file.limit](#clusteringplanstrategysmallfilelimit) | 600 (Optional) | Files smaller than the size specified here are candidates for clustering, default 600 MB

`Config Param: CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT` | +| [clustering.plan.strategy.target.file.max.bytes](#clusteringplanstrategytargetfilemaxbytes) | 1073741824 (Optional) | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups, default 1 GB

`Config Param: CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES` | +| [compaction.async.enabled](#compactionasyncenabled) | true (Optional) | Async Compaction, enabled by default for MOR

`Config Param: COMPACTION_ASYNC_ENABLED` | +| [compaction.delta_commits](#compactiondelta_commits) | 5 (Optional) | Max delta commits needed to trigger compaction, default 5 commits

`Config Param: COMPACTION_DELTA_COMMITS` | +| [hive_sync.enabled](#hive_syncenabled) | false (Optional) | Asynchronously sync Hive meta to HMS, default false

`Config Param: HIVE_SYNC_ENABLED` | +| [hive_sync.jdbc_url](#hive_syncjdbc_url) | jdbc:hive2://localhost:10000 (Optional) | Jdbc URL for hive sync, default 'jdbc:hive2://localhost:10000'

`Config Param: HIVE_SYNC_JDBC_URL` | +| [hive_sync.metastore.uris](#hive_syncmetastoreuris) | (Optional) | Metastore uris for hive sync, default ''

`Config Param: HIVE_SYNC_METASTORE_URIS` | +| [hive_sync.mode](#hive_syncmode) | HMS (Optional) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql, default 'hms'

`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot (Optional) | Decides how data files need to be read, in 1) Snapshot mode (obtain latest view, based on row & columnar data); 2) incremental mode (new data since an instantTime); 3) Read Optimized mode (obtain latest view, based on columnar data). Default: snapshot

`Config Param: QUERY_TYPE` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)

`Config Param: HIVE_STYLE_PARTITIONING` | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | (Optional) | Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`. Actual value obtained by invoking .toString(), default ''

`Config Param: PARTITION_PATH_FIELD` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | uuid (Optional) | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`

`Config Param: RECORD_KEY_FIELD` | +| [index.type](#indextype) | FLINK_STATE (Optional) | Index type of Flink write job, default is using state backed index.

`Config Param: INDEX_TYPE` | +| [metadata.compaction.delta_commits](#metadatacompactiondelta_commits) | 10 (Optional) | Max delta commits for metadata table to trigger compaction, default 10

`Config Param: METADATA_COMPACTION_DELTA_COMMITS` | +| [metadata.enabled](#metadataenabled) | true (Optional) | Enable the internal metadata table which serves table metadata like level file listings, default enabled

`Config Param: METADATA_ENABLED` | +| [precombine.field](#precombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)

`Config Param: PRECOMBINE_FIELD` | +| [read.streaming.enabled](#readstreamingenabled) | false (Optional) | Whether to read as streaming source, default false

`Config Param: READ_AS_STREAMING` | +| [table.type](#tabletype) | COPY_ON_WRITE (Optional) | Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ

`Config Param: TABLE_TYPE` | +| [write.operation](#writeoperation) | upsert (Optional) | The write operation that this write should do

`Config Param: OPERATION` | +| [write.parquet.max.file.size](#writeparquetmaxfilesize) | 120 (Optional) | Target size for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.

`Config Param: WRITE_PARQUET_MAX_FILE_SIZE` | --- - ## Write Client Configs {#WRITE_CLIENT} Internally, the Hudi datasource uses a RDD based HoodieWriteClient API to actually perform writes to storage. These configs provide deep control over lower level aspects like file sizing, compression, parallelism, compaction, write schema, cleaning etc. Although Hudi provides sane defaults, from time-time these configs may need to be tweaked to optimize for specific workloads. @@ -156,11 +146,11 @@ Configurations used by the Hudi Metadata Table. This table maintains the metadat [**Basic Configs**](#Metadata-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.metadata.enable](#hoodiemetadataenable) | true (Optional) | Enable the internal metadata table which serves table metadata like level file listings | 0.7.0 | -| [hoodie.metadata.index.bloom.filter.enable](#hoodiemetadataindexbloomfilterenable) | false (Optional) | Enable indexing bloom filters of user data files under metadata table. When enabled, metadata table will have a partition to store the bloom filter index and will be used during the index lookups. | 0.11.0 | -| [hoodie.metadata.index.column.stats.enable](#hoodiemetadataindexcolumnstatsenable) | false (Optional) | Enable indexing column ranges of user data files under metadata table key lookups. When enabled, metadata table will have a partition to store the column ranges and will be used for pruning files during the index lookups. | 0.11.0 | +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.metadata.enable](#hoodiemetadataenable) | true (Optional) | Enable the internal metadata table which serves table metadata like level file listings

`Config Param: ENABLE`
`Since Version: 0.7.0` | +| [hoodie.metadata.index.bloom.filter.enable](#hoodiemetadataindexbloomfilterenable) | false (Optional) | Enable indexing bloom filters of user data files under metadata table. When enabled, metadata table will have a partition to store the bloom filter index and will be used during the index lookups.

`Config Param: ENABLE_METADATA_INDEX_BLOOM_FILTER`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.enable](#hoodiemetadataindexcolumnstatsenable) | false (Optional) | Enable indexing column ranges of user data files under metadata table key lookups. When enabled, metadata table will have a partition to store the column ranges and will be used for pruning files during the index lookups.

`Config Param: ENABLE_METADATA_INDEX_COLUMN_STATS`
`Since Version: 0.11.0` | --- @@ -173,10 +163,10 @@ Configurations that control aspects around writing, sizing, reading base and log [**Basic Configs**](#Storage-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------ | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.parquet.compression.codec](#hoodieparquetcompressioncodec) | gzip (Optional) | Compression Codec for parquet files | | -| [hoodie.parquet.max.file.size](#hoodieparquetmaxfilesize) | 125829120 (Optional) | Target size in bytes for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------ | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.parquet.compression.codec](#hoodieparquetcompressioncodec) | gzip (Optional) | Compression Codec for parquet files

`Config Param: PARQUET_COMPRESSION_CODEC_NAME` | +| [hoodie.parquet.max.file.size](#hoodieparquetmaxfilesize) | 125829120 (Optional) | Target size in bytes for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.

`Config Param: PARQUET_MAX_FILE_SIZE` | --- @@ -189,10 +179,10 @@ Configurations that control archival. [**Basic Configs**](#Archival-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------ | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.keep.max.commits](#hoodiekeepmaxcommits) | 30 (Optional) | Archiving service moves older entries from timeline into an archived log after each write, to keep the metadata overhead constant, even as the table size grows. This config controls the maximum number of instants to retain in the active timeline. | | -| [hoodie.keep.min.commits](#hoodiekeepmincommits) | 20 (Optional) | Similar to hoodie.keep.max.commits, but controls the minimum number of instants to retain in the active timeline. | | +| Config Name | Default | Description | +| ------------------------------------------------ | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.keep.max.commits](#hoodiekeepmaxcommits) | 30 (Optional) | Archiving service moves older entries from timeline into an archived log after each write, to keep the metadata overhead constant, even as the table size grows. This config controls the maximum number of instants to retain in the active timeline.

`Config Param: MAX_COMMITS_TO_KEEP` | +| [hoodie.keep.min.commits](#hoodiekeepmincommits) | 20 (Optional) | Similar to hoodie.keep.max.commits, but controls the minimum number of instants to retain in the active timeline.

`Config Param: MIN_COMMITS_TO_KEEP` | --- @@ -205,9 +195,9 @@ Configurations that control how you want to bootstrap your existing tables for t [**Basic Configs**](#Bootstrap-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------ | ------------------ | ---------------------------------------------------------------------- | ------------- | -| [hoodie.bootstrap.base.path](#hoodiebootstrapbasepath) | N/A **(Required)** | Base path of the dataset that needs to be bootstrapped as a Hudi table | 0.6.0 | +| Config Name | Default | Description | +| ------------------------------------------------------ | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.bootstrap.base.path](#hoodiebootstrapbasepath) | N/A **(Required)** | Base path of the dataset that needs to be bootstrapped as a Hudi table

`Config Param: BASE_PATH`
`Since Version: 0.6.0` | --- @@ -220,10 +210,10 @@ Cleaning (reclamation of older/unused file groups/slices). [**Basic Configs**](#Clean-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.clean.async](#hoodiecleanasync) | false (Optional) | Only applies when hoodie.clean.automatic is turned on. When turned on runs cleaner async with writing, which can speed up overall write performance. | | -| [hoodie.cleaner.commits.retained](#hoodiecleanercommitsretained) | 10 (Optional) | Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much data retention the table supports for incremental queries. | | +| Config Name | Default | Description | +| ---------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.clean.async](#hoodiecleanasync) | false (Optional) | Only applies when hoodie.clean.automatic is turned on. When turned on runs cleaner async with writing, which can speed up overall write performance.

`Config Param: ASYNC_CLEAN` | +| [hoodie.cleaner.commits.retained](#hoodiecleanercommitsretained) | 10 (Optional) | Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much data retention the table supports for incremental queries.

`Config Param: CLEANER_COMMITS_RETAINED` | --- @@ -236,12 +226,12 @@ Configurations that control the clustering table service in hudi, which optimize [**Basic Configs**](#Clustering-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------------------------------- | --------------------- | ----------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false (Optional) | Enable running of clustering service, asynchronously as inserts happen on the table. | 0.7.0 | -| [hoodie.clustering.inline](#hoodieclusteringinline) | false (Optional) | Turn on inline clustering - clustering will be run after each write operation is complete | 0.7.0 | -| [hoodie.clustering.plan.strategy.small.file.limit](#hoodieclusteringplanstrategysmallfilelimit) | 314572800 (Optional) | Files smaller than the size in bytes specified here are candidates for clustering | 0.7.0 | -| [hoodie.clustering.plan.strategy.target.file.max.bytes](#hoodieclusteringplanstrategytargetfilemaxbytes) | 1073741824 (Optional) | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups | 0.7.0 | +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------- | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false (Optional) | Enable running of clustering service, asynchronously as inserts happen on the table.

`Config Param: ASYNC_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.clustering.inline](#hoodieclusteringinline) | false (Optional) | Turn on inline clustering - clustering will be run after each write operation is complete

`Config Param: INLINE_CLUSTERING`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.small.file.limit](#hoodieclusteringplanstrategysmallfilelimit) | 314572800 (Optional) | Files smaller than the size in bytes specified here are candidates for clustering

`Config Param: PLAN_STRATEGY_SMALL_FILE_LIMIT`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.target.file.max.bytes](#hoodieclusteringplanstrategytargetfilemaxbytes) | 1073741824 (Optional) | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups
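As a sketch of how these clustering knobs are wired in, the options below could be added to a Spark datasource write. The values mirror the defaults above; `df`, the table name, and the path are assumptions, not part of this patch:

```scala
// spark-shell sketch: assumes `spark`, the Hudi bundle, and an input DataFrame `df`.
import org.apache.spark.sql.SaveMode

df.write.format("hudi").
  option("hoodie.table.name", "hudi_trips").
  option("hoodie.datasource.write.recordkey.field", "uuid").
  option("hoodie.datasource.write.precombine.field", "ts").
  option("hoodie.clustering.inline", "true").                                      // cluster after each write
  option("hoodie.clustering.plan.strategy.small.file.limit", "314572800").         // files < 300 MB are candidates
  option("hoodie.clustering.plan.strategy.target.file.max.bytes", "1073741824").   // aim for ~1 GB output files
  mode(SaveMode.Append).
  save("/tmp/hudi_trips")
```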

`Config Param: PLAN_STRATEGY_TARGET_FILE_MAX_BYTES`
`Since Version: 0.7.0` | --- @@ -254,15 +244,15 @@ Configurations that control compaction (merging of log files onto a new base fil [**Basic Configs**](#Compaction-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.compact.inline](#hoodiecompactinline) | false (Optional) | When set to true, compaction service is triggered after each write. While being simpler operationally, this adds extra latency on the write path. | | -| [hoodie.compact.inline.max.delta.commits](#hoodiecompactinlinemaxdeltacommits) | 5 (Optional) | Number of delta commits after the last compaction, before scheduling of a new compaction is attempted. This config takes effect only for the compaction triggering strategy based on the number of commits, i.e., NUM_COMMITS, NUM_COMMITS_AFTER_LAST_REQUEST, NUM_AND_TIME, and NUM_OR_TIME. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.compact.inline](#hoodiecompactinline) | false (Optional) | When set to true, compaction service is triggered after each write. While being simpler operationally, this adds extra latency on the write path.

`Config Param: INLINE_COMPACT` | +| [hoodie.compact.inline.max.delta.commits](#hoodiecompactinlinemaxdeltacommits) | 5 (Optional) | Number of delta commits after the last compaction, before scheduling of a new compaction is attempted. This config takes effect only for the compaction triggering strategy based on the number of commits, i.e., NUM_COMMITS, NUM_COMMITS_AFTER_LAST_REQUEST, NUM_AND_TIME, and NUM_OR_TIME.
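A hedged example of enabling inline compaction on a merge-on-read table through the Spark datasource is sketched below. The table-type option is the standard datasource write option rather than part of this table, and `df`, the table name, and the path are illustrative:

```scala
// spark-shell sketch: assumes `spark`, the Hudi bundle, and an input DataFrame `df`.
import org.apache.spark.sql.SaveMode

df.write.format("hudi").
  option("hoodie.table.name", "hudi_trips").
  option("hoodie.datasource.write.table.type", "MERGE_ON_READ"). // compaction merges log files, so MOR
  option("hoodie.datasource.write.recordkey.field", "uuid").
  option("hoodie.datasource.write.precombine.field", "ts").
  option("hoodie.compact.inline", "true").                       // compact after the write completes
  option("hoodie.compact.inline.max.delta.commits", "5").        // ...once 5 delta commits accumulate
  mode(SaveMode.Append).
  save("/tmp/hudi_trips")
```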

`Config Param: INLINE_COMPACT_NUM_DELTA_COMMITS` | --- ### Write Configurations {#Write-Configurations} -Configurations that control write behavior on Hudi tables. These can be directly passed down from even higher level frameworks (e.g Spark datasources, Flink sink) and utilities (e.g DeltaStreamer). +Configurations that control write behavior on Hudi tables. These can be directly passed down from even higher level frameworks (e.g Spark datasources, Flink sink) and utilities (e.g Hudi Streamer). @@ -270,12 +260,12 @@ Configurations that control write behavior on Hudi tables. These can be directly [**Basic Configs**](#Write-Configurations-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------- | ------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.base.path](#hoodiebasepath) | N/A **(Required)** | Base path on lake storage, under which all the table data is stored. Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs etc in .hoodie directory under this base path directory. | | -| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name that will be used for registering with metastores like HMS. Needs to be same across runs. | | -| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) | | -| [hoodie.write.concurrency.mode](#hoodiewriteconcurrencymode) | SINGLE_WRITER (Optional) | org.apache.hudi.common.model.WriteConcurrencyMode: Concurrency modes for write operations. SINGLE_WRITER(default): Only one active writer to the table. Maximizes throughput. OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table with lazy conflict resolution using locks. This means that only one writer succeeds if multiple writers write to the same file group. | | +| Config Name | Default | Description | +| --------------------------------------------------------------------------------- | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.base.path](#hoodiebasepath) | N/A **(Required)** | Base path on lake storage, under which all the table data is stored. Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs etc in .hoodie directory under this base path directory.

`Config Param: BASE_PATH` | +| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name that will be used for registering with metastores like HMS. Needs to be same across runs.

`Config Param: TBL_NAME` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)

`Config Param: PRECOMBINE_FIELD_NAME` | +| [hoodie.write.concurrency.mode](#hoodiewriteconcurrencymode) | SINGLE_WRITER (Optional) | org.apache.hudi.common.model.WriteConcurrencyMode: Concurrency modes for write operations. SINGLE_WRITER(default): Only one active writer to the table. Maximizes throughput. OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table with lazy conflict resolution using locks. This means that only one writer succeeds if multiple writers write to the same file group.
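Putting the required write configs together, a minimal spark-shell sketch might look like the following. The sample data, table name, and path are purely illustrative, and the base path is supplied through `save()`:

```scala
// spark-shell sketch: `spark` is the active SparkSession and the Hudi bundle is on the classpath.
import org.apache.spark.sql.SaveMode
import spark.implicits._

val basePath = "file:///tmp/hudi_trips"                       // illustrative lake-storage path
val df = Seq(("id-1", "americas/usa", 1694995200L))           // tiny illustrative frame
  .toDF("uuid", "partitionpath", "ts")

df.write.format("hudi").
  option("hoodie.table.name", "hudi_trips").                  // required: name used for metastore registration
  option("hoodie.datasource.write.recordkey.field", "uuid").
  option("hoodie.datasource.write.partitionpath.field", "partitionpath").
  option("hoodie.datasource.write.precombine.field", "ts").   // dedupe by picking the largest `ts`
  option("hoodie.write.concurrency.mode", "SINGLE_WRITER").   // default; OPTIMISTIC_CONCURRENCY_CONTROL for multi-writer
  mode(SaveMode.Append).
  save(basePath)                                              // the datasource derives the base path from save()
```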

`Config Param: WRITE_CONCURRENCY_MODE` | --- @@ -292,11 +282,11 @@ Hudi maintains keys (record key + partition path) for uniquely identifying a par [**Basic Configs**](#Key-Generator-Options-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------ | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | N/A **(Required)** | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString() | | -| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | N/A **(Required)** | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c` | | -| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values) | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | N/A **(Required)** | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString()

`Config Param: PARTITIONPATH_FIELD_NAME` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | N/A **(Required)** | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`

`Config Param: RECORDKEY_FIELD_NAME` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)
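For instance, the record key, partition path, and Hive-style partitioning could be wired up as below — a sketch assuming the same kind of `df`, table name, and path as in the earlier examples:

```scala
// spark-shell sketch: assumes `spark`, the Hudi bundle, and an input DataFrame `df`.
import org.apache.spark.sql.SaveMode

df.write.format("hudi").
  option("hoodie.table.name", "hudi_trips").
  option("hoodie.datasource.write.recordkey.field", "uuid").              // nested fields use dot notation, e.g. a.b.c
  option("hoodie.datasource.write.partitionpath.field", "partitionpath").
  option("hoodie.datasource.write.hive_style_partitioning", "true").      // folders become partitionpath=<value>
  option("hoodie.datasource.write.precombine.field", "ts").
  mode(SaveMode.Append).
  save("/tmp/hudi_trips")
```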

`Config Param: HIVE_STYLE_PARTITIONING_ENABLE` | --- @@ -313,12 +303,11 @@ Configurations that control indexing behavior, which tags incoming records as ei [**Basic Configs**](#Common-Index-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------- | ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.index.type](#hoodieindextype) | N/A **(Required)** | org.apache.hudi.index.HoodieIndex$IndexType: Determines how input records are indexed, i.e., looked up based on the key for the location in the existing table. Default is SIMPLE on Spark engine, and INMEMORY on Flink and Java engines. HBASE: uses an external managed Apache HBase table to store record key to location mapping. HBase index is a global index, enforcing key uniqueness across all partitions in the table. INMEMORY: Uses in-memory hashmap in Spark and Java engine and Flink in-memory state in Flink for indexing. BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced inside partitions. GLOBAL_BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced across all partitions in the table. SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage.Key uniqueness is enforced inside partitions. GLOBAL_SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage.Key uniqueness is enforced across all partitions in the table. 
BUCKET: locates the file group containing the record fast by using bucket hashing, particularly beneficial in large scale. Use `hoodie.index.bucket.engine` to choose bucket engine type, i.e., how buckets are generated. FLINK_STATE: Internal Config for indexing based on Flink state. RECORD_INDEX: Index which saves the record key to location mappings in the HUDI Metadata Table. Record index is a global index, enforcing key uniqueness across all partitions in the table. Supports sharding to achieve very high scale. | | +| Config Name | Default | Description | +| ------------------------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.index.type](#hoodieindextype) | N/A **(Required)** | org.apache.hudi.index.HoodieIndex$IndexType: Determines how input records are indexed, i.e., looked up based on the key for the location in the existing table. Default is SIMPLE on Spark engine, and INMEMORY on Flink and Java engines. HBASE: uses an external managed Apache HBase table to store record key to location mapping. HBase index is a global index, enforcing key uniqueness across all partitions in the table. INMEMORY: Uses in-memory hashmap in Spark and Java engine and Flink in-memory state in Flink for indexing. BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced inside partitions. GLOBAL_BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced across all partitions in the table. 
SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage. Key uniqueness is enforced inside partitions. GLOBAL_SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage. Key uniqueness is enforced across all partitions in the table. BUCKET: locates the file group containing the record fast by using bucket hashing, particularly beneficial in large scale. Use `hoodie.index.bucket.engine` to choose bucket engine type, i.e., how buckets are generated. FLINK_STATE: Internal Config for indexing based on Flink state. RECORD_INDEX: Index which saves the record key to location mappings in the HUDI Metadata Table. Record index is a global index, enforcing key uniqueness across all partitions in the table. Supports sharding to achieve very high scale.
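For example, the index type can be pinned on the writer. The sketch below assumes `df`, the table name, and the path, and picks `GLOBAL_BLOOM` purely for illustration:

```scala
// spark-shell sketch: assumes `spark`, the Hudi bundle, and an input DataFrame `df`.
import org.apache.spark.sql.SaveMode

df.write.format("hudi").
  option("hoodie.table.name", "hudi_trips").
  option("hoodie.datasource.write.recordkey.field", "uuid").
  option("hoodie.datasource.write.precombine.field", "ts").
  option("hoodie.index.type", "GLOBAL_BLOOM").   // enforce key uniqueness across all partitions
  mode(SaveMode.Append).
  save("/tmp/hudi_trips")
```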

`Config Param: INDEX_TYPE` | --- - ## Metastore and Catalog Sync Configs {#META_SYNC} Configurations used by the Hudi to sync metadata to external metastores and catalogs. @@ -332,9 +321,9 @@ Configurations used by the Hudi to sync metadata to external metastores and cata [**Basic Configs**](#Common-Metadata-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.

`Config Param: META_SYNC_ENABLED` | --- @@ -347,9 +336,9 @@ Configurations used by the Hudi to sync metadata to Google BigQuery. [**Basic Configs**](#BigQuery-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.

`Config Param: META_SYNC_ENABLED` | --- @@ -362,13 +351,13 @@ Configurations used by the Hudi to sync metadata to Hive Metastore. [**Basic Configs**](#Hive-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------- | --------------------------------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql. | | -| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore. | | -| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url | | -| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url | | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.

`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore.

`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url

`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url

`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.
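A minimal sketch of enabling Hive metastore sync from a Spark datasource write follows. The metastore URI matches the default shown above, while `df`, the table name, and the path are assumptions:

```scala
// spark-shell sketch: assumes `spark`, the Hudi bundle, and an input DataFrame `df`.
import org.apache.spark.sql.SaveMode

df.write.format("hudi").
  option("hoodie.table.name", "hudi_trips").
  option("hoodie.datasource.write.recordkey.field", "uuid").
  option("hoodie.datasource.write.precombine.field", "ts").
  option("hoodie.datasource.meta.sync.enable", "true").
  option("hoodie.datasource.hive_sync.enable", "true").
  option("hoodie.datasource.hive_sync.mode", "hms").                               // talk to the metastore directly
  option("hoodie.datasource.hive_sync.metastore.uris", "thrift://localhost:9083"). // illustrative URI (the default)
  mode(SaveMode.Append).
  save("/tmp/hudi_trips")
```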

`Config Param: META_SYNC_ENABLED` | --- @@ -381,13 +370,13 @@ Global replication configurations used by the Hudi to sync metadata to Hive Meta [**Basic Configs**](#Global-Hive-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------- | --------------------------------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql. | | -| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore. | | -| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url | | -| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url | | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.

`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore.

`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url

`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url

`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.

`Config Param: META_SYNC_ENABLED` | --- @@ -400,12 +389,11 @@ Configurations used by the Hudi to sync metadata to DataHub. [**Basic Configs**](#DataHub-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.

`Config Param: META_SYNC_ENABLED` | --- - ## Metrics Configs {#METRICS} These set of configs are used to enable monitoring and reporting of keyHudi stats and metrics. @@ -419,13 +407,13 @@ Enables reporting on Hudi metrics. Hudi publishes metrics on every commit, clean [**Basic Configs**](#Metrics-Configurations-basic-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------- | ------------------- | ---------------------------------------------- | ------------- | -| [hoodie.metrics.on](#hoodiemetricson) | false (Optional) | Turn on/off metrics reporting. off by default. | 0.5.0 | -| [hoodie.metrics.reporter.type](#hoodiemetricsreportertype) | GRAPHITE (Optional) | Type of metrics reporter. | 0.5.0 | +| Config Name | Default | Description | +| ----------------------------------------------------------------------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.on](#hoodiemetricson) | false (Optional) | Turn on/off metrics reporting. off by default.

`Config Param: TURN_METRICS_ON`
`Since Version: 0.5.0` | +| [hoodie.metrics.reporter.type](#hoodiemetricsreportertype) | GRAPHITE (Optional) | Type of metrics reporter.

`Config Param: METRICS_REPORTER_TYPE_VALUE`
`Since Version: 0.5.0` | +| [hoodie.metricscompaction.log.blocks.on](#hoodiemetricscompactionlogblockson) | false (Optional) | Turn on/off metrics reporting for log blocks with compaction commit. off by default.
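For example, metrics reporting could be switched on per write as sketched below. `GRAPHITE` mirrors the default reporter shown above; `df`, the table name, and the path are illustrative:

```scala
// spark-shell sketch: assumes `spark`, the Hudi bundle, and an input DataFrame `df`.
import org.apache.spark.sql.SaveMode

df.write.format("hudi").
  option("hoodie.table.name", "hudi_trips").
  option("hoodie.datasource.write.recordkey.field", "uuid").
  option("hoodie.datasource.write.precombine.field", "ts").
  option("hoodie.metrics.on", "true").                // publish metrics on commit, clean, rollback, etc.
  option("hoodie.metrics.reporter.type", "GRAPHITE"). // default reporter type
  mode(SaveMode.Append).
  save("/tmp/hudi_trips")
```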

`Config Param: TURN_METRICS_COMPACTION_LOG_BLOCKS_ON`
`Since Version: 0.14.0` | --- - ## Kafka Connect Configs {#KAFKA_CONNECT} These set of configs are used for Kafka Connect Sink Connector for writing Hudi Tables @@ -439,175 +427,28 @@ Configurations for Kafka Connect Sink Connector for Hudi. [**Basic Configs**](#Kafka-Sink-Connect-Configurations-basic-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------- | ------------------------- | -------------------------------------------- | ------------- | -| [bootstrap.servers](#bootstrapservers) | localhost:9092 (Optional) | The bootstrap servers for the Kafka Cluster. | | ---- - - -## DeltaStreamer Configs {#DELTA_STREAMER} -These set of configs are used for DeltaStreamer utility which provides the way to ingest from different sources such as DFS or Kafka. - - -### DeltaStreamer Configs {#DeltaStreamer-Configs} -Configurations that control DeltaStreamer. - - - - -[**Basic Configs**](#DeltaStreamer-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------- | ------------------ | ----------------- | ------------- | -| [hoodie.deltastreamer.source.kafka.topic](#hoodiedeltastreamersourcekafkatopic) | N/A **(Required)** | Kafka topic name. | | ---- - - -### DeltaStreamer SQL Transformer Configs {#DeltaStreamer-SQL-Transformer-Configs} -Configurations controlling the behavior of SQL transformer in Deltastreamer. - - - - -[**Basic Configs**](#DeltaStreamer-SQL-Transformer-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------- | ------------------ | -------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.transformer.sql](#hoodiedeltastreamertransformersql) | N/A **(Required)** | SQL Query to be executed during write | | -| [hoodie.deltastreamer.transformer.sql.file](#hoodiedeltastreamertransformersqlfile) | N/A **(Required)** | File with a SQL script to be executed during write | | ---- - - -### DeltaStreamer Source Configs {#DELTA_STREAMER_SOURCE} -Configurations controlling the behavior of reading source data. - - -#### DFS Path Selector Configs {#DFS-Path-Selector-Configs} -Configurations controlling the behavior of path selector for DFS source in Deltastreamer. - - - - -[**Basic Configs**](#DFS-Path-Selector-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------- | ------------------ | ------------------------------ | ------------- | -| [hoodie.deltastreamer.source.dfs.root](#hoodiedeltastreamersourcedfsroot) | N/A **(Required)** | Root path of the source on DFS | | ---- - - -#### Hudi Incremental Source Configs {#Hudi-Incremental-Source-Configs} -Configurations controlling the behavior of incremental pulling from a Hudi table as a source in Deltastreamer. 
- - - - -[**Basic Configs**](#Hudi-Incremental-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------------- | ------------------ | ----------------------------------- | ------------- | -| [hoodie.deltastreamer.source.hoodieincr.path](#hoodiedeltastreamersourcehoodieincrpath) | N/A **(Required)** | Base-path for the source Hudi table | | +| Config Name | Default | Description | +| -------------------------------------- | ------------------------- | ----------------------------------------------------------------------------------------------- | +| [bootstrap.servers](#bootstrapservers) | localhost:9092 (Optional) | The bootstrap servers for the Kafka Cluster.

`Config Param: KAFKA_BOOTSTRAP_SERVERS` | --- +## Hudi Streamer Configs {#HUDI_STREAMER} +These set of configs are used for Hudi Streamer utility which provides the way to ingest from different sources such as DFS or Kafka. -#### Kafka Source Configs {#Kafka-Source-Configs} -Configurations controlling the behavior of Kafka source in Deltastreamer. - - - - -[**Basic Configs**](#Kafka-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------- | ------------------ | ----------------- | ------------- | -| [hoodie.deltastreamer.source.kafka.topic](#hoodiedeltastreamersourcekafkatopic) | N/A **(Required)** | Kafka topic name. | | ---- - - -#### Pulsar Source Configs {#Pulsar-Source-Configs} -Configurations controlling the behavior of Pulsar source in Deltastreamer. - - - - -[**Basic Configs**](#Pulsar-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------- | ---------------------------------- | ------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.pulsar.topic](#hoodiedeltastreamersourcepulsartopic) | N/A **(Required)** | Name of the target Pulsar topic to source data from | | -| [hoodie.deltastreamer.source.pulsar.endpoint.admin.url](#hoodiedeltastreamersourcepulsarendpointadminurl) | http://localhost:8080 (Optional) | URL of the target Pulsar endpoint (of the form 'pulsar://host:port' | | -| [hoodie.deltastreamer.source.pulsar.endpoint.service.url](#hoodiedeltastreamersourcepulsarendpointserviceurl) | pulsar://localhost:6650 (Optional) | URL of the target Pulsar endpoint (of the form 'pulsar://host:port' | | ---- - - -#### S3 Source Configs {#S3-Source-Configs} -Configurations controlling the behavior of S3 source in Deltastreamer. - - - - -[**Basic Configs**](#S3-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------- | ------------------ | --------------------------------- | ------------- | -| [hoodie.deltastreamer.s3.source.queue.url](#hoodiedeltastreamers3sourcequeueurl) | N/A **(Required)** | Queue url for cloud object events | | ---- - - -#### SQL Source Configs {#SQL-Source-Configs} -Configurations controlling the behavior of SQL source in Deltastreamer. - - - - -[**Basic Configs**](#SQL-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------- | ------------------ | ----------------------------------- | ------------- | -| [hoodie.deltastreamer.source.sql.sql.query](#hoodiedeltastreamersourcesqlsqlquery) | N/A **(Required)** | SQL query for fetching source data. | | ---- - - -### DeltaStreamer Schema Provider Configs {#SCHEMA_PROVIDER} -Configurations that control the schema provider for DeltaStreamer. 
- - -#### DeltaStreamer Schema Provider Configs {#DeltaStreamer-Schema-Provider-Configs} - - - - - -[**Basic Configs**](#DeltaStreamer-Schema-Provider-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------- | ------------------ | ------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.schemaprovider.registry.targetUrl](#hoodiedeltastreamerschemaproviderregistrytargetUrl) | N/A **(Required)** | The schema of the target you are writing to e.g. https://foo:bar@schemaregistry.org | | -| [hoodie.deltastreamer.schemaprovider.registry.url](#hoodiedeltastreamerschemaproviderregistryurl) | N/A **(Required)** | The schema of the source you are reading from e.g. https://foo:bar@schemaregistry.org | | ---- +### Hudi Streamer Configs {#Hudi-Streamer-Configs} -#### File-based Schema Provider Configs {#File-based-Schema-Provider-Configs} -Configurations for file-based schema provider. -[**Basic Configs**](#File-based-Schema-Provider-Configs-basic-configs) +[**Basic Configs**](#Hudi-Streamer-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------ | ------------------ | --------------------------------------------- | ------------- | -| [hoodie.deltastreamer.schemaprovider.source.schema.file](#hoodiedeltastreamerschemaprovidersourceschemafile) | N/A **(Required)** | The schema of the source you are reading from | | -| [hoodie.deltastreamer.schemaprovider.target.schema.file](#hoodiedeltastreamerschemaprovidertargetschemafile) | N/A **(Required)** | The schema of the target you are writing to | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.deltastreamer.source.kafka.topic](#hoodiedeltastreamersourcekafkatopic) | N/A **(Required)** | Kafka topic name. The config is specific to HoodieMultiTableDeltaStreamer

`Config Param: KAFKA_TOPIC` | +| [hoodie.deltastreamer.sample.writes.enabled](#hoodiedeltastreamersamplewritesenabled) | false (Optional) | Set this to true to sample from the first batch of records and write to the auxiliary path, before writing to the table. The sampled records are used to calculate the average record size. The relevant write client will have `hoodie.copyonwrite.record.size.estimate` being overwritten by the calculated result.

`Config Param: SAMPLE_WRITES_ENABLED` | +| [hoodie.deltastreamer.sample.writes.size](#hoodiedeltastreamersamplewritessize) | 5000 (Optional) | Number of records to sample from the first write. To improve the estimation's accuracy, use a larger sample size for smaller or more compressible records, and a smaller sample size for larger or less compressible records.

`Config Param: SAMPLE_WRITES_SIZE` | --- diff --git a/website/docs/configurations.md b/website/docs/configurations.md index 270636c5f1d6..41c5752e8f2a 100644 --- a/website/docs/configurations.md +++ b/website/docs/configurations.md @@ -3,14 +3,11 @@ title: All Configurations keywords: [ configurations, default, flink options, spark, configs, parameters ] permalink: /docs/configurations.html summary: This page covers the different ways of configuring your job to write/read Hudi tables. At a high level, you can control behaviour at few levels. -last_modified_at: 2023-07-07T17:00:30.441 -hide_table_of_contents: true +toc_min_heading_level: 2 +toc_max_heading_level: 4 +last_modified_at: 2023-07-21T07:02:09.433 --- -import TOCInline from '@theme/TOCInline'; - - ---- This page covers the different ways of configuring your job to write/read Hudi tables. At a high level, you can control behaviour at few levels. @@ -31,7 +28,7 @@ By default, Hudi would load the configuration file under `/etc/hudi/conf` direct - [**Record Payload Config**](#RECORD_PAYLOAD): This is the lowest level of customization offered by Hudi. Record payloads define how to produce new values to upsert based on incoming new record and stored old record. Hudi provides default implementations such as OverwriteWithLatestAvroPayload which simply update table with the latest/last-written record. This can be overridden to a custom class extending HoodieRecordPayload class, on both datasource and WriteClient levels. - [**Kafka Connect Configs**](#KAFKA_CONNECT): These set of configs are used for Kafka Connect Sink Connector for writing Hudi Tables - [**Amazon Web Services Configs**](#AWS): Configurations specific to Amazon Web Services. -- [**DeltaStreamer Configs**](#DELTA_STREAMER): These set of configs are used for DeltaStreamer utility which provides the way to ingest from different sources such as DFS or Kafka. +- [**Hudi Streamer Configs**](#HUDI_STREAMER): These set of configs are used for Hudi Streamer utility which provides the way to ingest from different sources such as DFS or Kafka. ## Externalized Config File Instead of directly passing configuration settings to every Hudi job, you can also centrally set them in a configuration @@ -52,34 +49,34 @@ Options useful for reading tables via `read.format.option(...)` [**Basic Configs**](#Read-Options-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.read.begin.instanttime](#hoodiedatasourcereadbegininstanttime) | N/A **(Required)** | Required when `hoodie.datasource.query.type` is set to `incremental`. Represents the instant time to start incrementally pulling data from. The instanttime here need not necessarily correspond to an instant on the timeline. New data written with an instant_time > BEGIN_INSTANTTIME are fetched out. 
For e.g: ‘20170901080000’ will get all new data written after Sep 1, 2017 08:00AM. Note that if `hoodie.datasource.read.handle.hollow.commit` set to USE_STATE_TRANSITION_TIME, will use instant's `stateTransitionTime` to perform comparison. | | -| [hoodie.datasource.read.end.instanttime](#hoodiedatasourcereadendinstanttime) | N/A **(Required)** | Used when `hoodie.datasource.query.type` is set to `incremental`. Represents the instant time to limit incrementally fetched data to. When not specified latest commit time from timeline is assumed by default. When specified, new data written with an instant_time <= END_INSTANTTIME are fetched out. Point in time type queries makes more sense with begin and end instant times specified. Note that if `hoodie.datasource.read.handle.hollow.commit` set to `USE_STATE_TRANSITION_TIME`, will use instant's `stateTransitionTime` to perform comparison. | | -| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot (Optional) | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files) | | -| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) | | +| Config Name | Default | Description | +| --------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.read.begin.instanttime](#hoodiedatasourcereadbegininstanttime) | N/A **(Required)** | Required when `hoodie.datasource.query.type` is set to `incremental`. Represents the instant time to start incrementally pulling data from. The instanttime here need not necessarily correspond to an instant on the timeline. New data written with an instant_time > BEGIN_INSTANTTIME are fetched out. For e.g: ‘20170901080000’ will get all new data written after Sep 1, 2017 08:00AM. Note that if `hoodie.datasource.read.handle.hollow.commit` set to USE_STATE_TRANSITION_TIME, will use instant's `stateTransitionTime` to perform comparison.

`Config Param: BEGIN_INSTANTTIME` | +| [hoodie.datasource.read.end.instanttime](#hoodiedatasourcereadendinstanttime) | N/A **(Required)** | Used when `hoodie.datasource.query.type` is set to `incremental`. Represents the instant time to limit incrementally fetched data to. When not specified latest commit time from timeline is assumed by default. When specified, new data written with an instant_time <= END_INSTANTTIME are fetched out. Point in time type queries make more sense with begin and end instant times specified. Note that if `hoodie.datasource.read.handle.hollow.commit` set to `USE_STATE_TRANSITION_TIME`, will use instant's `stateTransitionTime` to perform comparison.

`Config Param: END_INSTANTTIME` | +| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot (Optional) | Whether data needs to be read, in `incremental` mode (new data since an instantTime) (or) `read_optimized` mode (obtain latest view, based on base files) (or) `snapshot` mode (obtain latest view, by merging base and (if any) log files)

`Config Param: QUERY_TYPE` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)
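For example, an incremental query combines `hoodie.datasource.query.type` with a begin instant, as in this sketch — the instant value and path are illustrative, and an active spark-shell session is assumed:

```scala
// spark-shell sketch: `spark` is the active SparkSession and the Hudi bundle is on the classpath.
val incrDf = spark.read.format("hudi").
  option("hoodie.datasource.query.type", "incremental").
  option("hoodie.datasource.read.begin.instanttime", "20230901080000"). // illustrative instant time
  load("/tmp/hudi_trips")                                               // illustrative base path

incrDf.show(false)
```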

`Config Param: READ_PRE_COMBINE_FIELD` | [**Advanced Configs**](#Read-Options-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [as.of.instant](#asofinstant) | N/A **(Required)** | The query instant for time travel. Without specified this option, we query the latest snapshot. | | -| [hoodie.datasource.read.paths](#hoodiedatasourcereadpaths) | N/A **(Required)** | Comma separated list of file paths to read within a Hudi table. | | -| [hoodie.datasource.merge.type](#hoodiedatasourcemergetype) | payload_combine (Optional) | For Snapshot query on merge on read table, control whether we invoke the record payload implementation to merge (payload_combine) or skip merging altogetherskip_merge | | -| [hoodie.datasource.query.incremental.format](#hoodiedatasourcequeryincrementalformat) | latest_state (Optional) | This config is used alone with the 'incremental' query type.When set to 'latest_state', it returns the latest records' values.When set to 'cdc', it returns the cdc data. | 0.13.0 | -| [hoodie.datasource.read.extract.partition.values.from.path](#hoodiedatasourcereadextractpartitionvaluesfrompath) | false (Optional) | When set to true, values for partition columns (partition values) will be extracted from physical partition path (default Spark behavior). When set to false partition values will be read from the data file (in Hudi partition columns are persisted by default). This config is a fallback allowing to preserve existing behavior, and should not be used otherwise. | 0.11.0 | -| [hoodie.datasource.read.file.index.listing.mode](#hoodiedatasourcereadfileindexlistingmode) | lazy (Optional) | Overrides Hudi's file-index implementation's file listing mode: when set to 'eager', file-index will list all partition paths and corresponding file slices w/in them eagerly, during initialization, prior to partition-pruning kicking in, meaning that all partitions will be listed including ones that might be subsequently pruned out; when set to 'lazy', partitions and file-slices w/in them will be listed lazily (ie when they actually accessed, instead of when file-index is initialized) allowing partition pruning to occur before that, only listing partitions that has already been pruned. Please note that, this config is provided purely to allow to fallback to behavior existing prior to 0.13.0 release, and will be deprecated soon after. 
| 0.13.0 | -| [hoodie.datasource.read.file.index.listing.partition-path-prefix.analysis.enabled](#hoodiedatasourcereadfileindexlistingpartition-path-prefixanalysisenabled) | true (Optional) | Controls whether partition-path prefix analysis is enabled w/in the file-index, allowing to avoid necessity to recursively list deep folder structures of partitioned tables w/ multiple partition columns, by carefully analyzing provided partition-column predicates and deducing corresponding partition-path prefix from them (if possible). | 0.13.0 | -| [hoodie.datasource.read.handle.hollow.commit](#hoodiedatasourcereadhandlehollowcommit) | EXCEPTION (Optional) | When doing incremental queries, there could be hollow commits (requested or inflight commits that are not the latest) that are produced by concurrent writers and could lead to potential data loss. This config allows users to have different ways of handling this situation. The valid values are [EXCEPTION, BLOCK, USE_STATE_TRANSITION_TIME]: Use `EXCEPTION` to throw an exception when hollow commit is detected. This is helpful when hollow commits are not expected. Use `BLOCK` to block processing commits from going beyond the hollow ones. This fits the case where waiting for hollow commits to finish is acceptable. Use `USE_STATE_TRANSITION_TIME` (experimental) to query commits in range by state transition time (completion time), instead of commit time (start time). Using this mode will result in `begin.instanttime` and `end.instanttime` using `stateTransitionTime` instead of the instant's commit time. | 0.14.0 | -| [hoodie.datasource.read.incr.fallback.fulltablescan.enable](#hoodiedatasourcereadincrfallbackfulltablescanenable) | false (Optional) | When doing an incremental query whether we should fall back to full table scans if file does not exist. | | -| [hoodie.datasource.read.incr.filters](#hoodiedatasourcereadincrfilters) | (Optional) | For use-cases like DeltaStreamer which reads from Hoodie Incremental table and applies opaque map functions, filters appearing late in the sequence of transformations cannot be automatically pushed down. This option allows setting filters directly on Hoodie Source. | | -| [hoodie.datasource.read.incr.path.glob](#hoodiedatasourcereadincrpathglob) | (Optional) | For the use-cases like users only want to incremental pull from certain partitions instead of the full table. This option allows using glob pattern to directly filter on path. | | -| [hoodie.datasource.read.schema.use.end.instanttime](#hoodiedatasourcereadschemauseendinstanttime) | false (Optional) | Uses end instant schema when incrementally fetched data to. Default: users latest instant schema. | | -| [hoodie.datasource.streaming.startOffset](#hoodiedatasourcestreamingstartOffset) | earliest (Optional) | Start offset to pull data from hoodie streaming source. allow earliest, latest, and specified start instant time | 0.13.0 | -| [hoodie.enable.data.skipping](#hoodieenabledataskipping) | false (Optional) | Enables data-skipping allowing queries to leverage indexes to reduce the search space by skipping over files | 0.10.0 | -| [hoodie.file.index.enable](#hoodiefileindexenable) | true (Optional) | Enables use of the spark file index implementation for Hudi, that speeds up listing of large tables. 
| | -| [hoodie.schema.on.read.enable](#hoodieschemaonreadenable) | false (Optional) | Enables support for Schema Evolution feature | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [as.of.instant](#asofinstant) | N/A **(Required)** | The query instant for time travel. Without specified this option, we query the latest snapshot.

`Config Param: TIME_TRAVEL_AS_OF_INSTANT` | +| [hoodie.datasource.read.paths](#hoodiedatasourcereadpaths) | N/A **(Required)** | Comma separated list of file paths to read within a Hudi table.

`Config Param: READ_PATHS` | +| [hoodie.datasource.merge.type](#hoodiedatasourcemergetype) | payload_combine (Optional) | For Snapshot query on merge on read table, control whether we invoke the record payload implementation to merge (payload_combine) or skip merging altogether (skip_merge)

`Config Param: REALTIME_MERGE` | +| [hoodie.datasource.query.incremental.format](#hoodiedatasourcequeryincrementalformat) | latest_state (Optional) | This config is used alone with the 'incremental' query type. When set to 'latest_state', it returns the latest records' values. When set to 'cdc', it returns the cdc data.

`Config Param: INCREMENTAL_FORMAT`
`Since Version: 0.13.0` | +| [hoodie.datasource.read.extract.partition.values.from.path](#hoodiedatasourcereadextractpartitionvaluesfrompath) | false (Optional) | When set to true, values for partition columns (partition values) will be extracted from physical partition path (default Spark behavior). When set to false partition values will be read from the data file (in Hudi partition columns are persisted by default). This config is a fallback allowing to preserve existing behavior, and should not be used otherwise.

`Config Param: EXTRACT_PARTITION_VALUES_FROM_PARTITION_PATH`
`Since Version: 0.11.0` | +| [hoodie.datasource.read.file.index.listing.mode](#hoodiedatasourcereadfileindexlistingmode) | lazy (Optional) | Overrides Hudi's file-index implementation's file listing mode: when set to 'eager', file-index will list all partition paths and corresponding file slices w/in them eagerly, during initialization, prior to partition-pruning kicking in, meaning that all partitions will be listed including ones that might be subsequently pruned out; when set to 'lazy', partitions and file-slices w/in them will be listed lazily (ie when they actually accessed, instead of when file-index is initialized) allowing partition pruning to occur before that, only listing partitions that has already been pruned. Please note that, this config is provided purely to allow to fallback to behavior existing prior to 0.13.0 release, and will be deprecated soon after.

`Config Param: FILE_INDEX_LISTING_MODE_OVERRIDE`
`Since Version: 0.13.0` | +| [hoodie.datasource.read.file.index.listing.partition-path-prefix.analysis.enabled](#hoodiedatasourcereadfileindexlistingpartition-path-prefixanalysisenabled) | true (Optional) | Controls whether partition-path prefix analysis is enabled w/in the file-index, allowing to avoid necessity to recursively list deep folder structures of partitioned tables w/ multiple partition columns, by carefully analyzing provided partition-column predicates and deducing corresponding partition-path prefix from them (if possible).

`Config Param: FILE_INDEX_LISTING_PARTITION_PATH_PREFIX_ANALYSIS_ENABLED`
`Since Version: 0.13.0` | +| [hoodie.datasource.read.handle.hollow.commit](#hoodiedatasourcereadhandlehollowcommit) | EXCEPTION (Optional) | When doing incremental queries, there could be hollow commits (requested or inflight commits that are not the latest) that are produced by concurrent writers and could lead to potential data loss. This config allows users to have different ways of handling this situation. The valid values are [EXCEPTION, BLOCK, USE_STATE_TRANSITION_TIME]: Use `EXCEPTION` to throw an exception when hollow commit is detected. This is helpful when hollow commits are not expected. Use `BLOCK` to block processing commits from going beyond the hollow ones. This fits the case where waiting for hollow commits to finish is acceptable. Use `USE_STATE_TRANSITION_TIME` (experimental) to query commits in range by state transition time (completion time), instead of commit time (start time). Using this mode will result in `begin.instanttime` and `end.instanttime` using `stateTransitionTime` instead of the instant's commit time.

`Config Param: INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT`
`Since Version: 0.14.0` | +| [hoodie.datasource.read.incr.fallback.fulltablescan.enable](#hoodiedatasourcereadincrfallbackfulltablescanenable) | false (Optional) | When doing an incremental query whether we should fall back to full table scans if file does not exist.

`Config Param: INCREMENTAL_FALLBACK_TO_FULL_TABLE_SCAN_FOR_NON_EXISTING_FILES` | +| [hoodie.datasource.read.incr.filters](#hoodiedatasourcereadincrfilters) | (Optional) | For use-cases like DeltaStreamer which reads from Hoodie Incremental table and applies opaque map functions, filters appearing late in the sequence of transformations cannot be automatically pushed down. This option allows setting filters directly on Hoodie Source.

`Config Param: PUSH_DOWN_INCR_FILTERS` | +| [hoodie.datasource.read.incr.path.glob](#hoodiedatasourcereadincrpathglob) | (Optional) | For use-cases where users want to incrementally pull only from certain partitions instead of the full table, this option allows using a glob pattern to directly filter on path.

`Config Param: INCR_PATH_GLOB` | +| [hoodie.datasource.read.schema.use.end.instanttime](#hoodiedatasourcereadschemauseendinstanttime) | false (Optional) | Uses the end instant's schema when incrementally fetching data. Default: uses the latest instant's schema.

`Config Param: INCREMENTAL_READ_SCHEMA_USE_END_INSTANTTIME` | +| [hoodie.datasource.streaming.startOffset](#hoodiedatasourcestreamingstartOffset) | earliest (Optional) | Start offset to pull data from hoodie streaming source. Allowed values are earliest, latest, or a specified start instant time

`Config Param: START_OFFSET`
`Since Version: 0.13.0` | +| [hoodie.enable.data.skipping](#hoodieenabledataskipping) | false (Optional) | Enables data-skipping allowing queries to leverage indexes to reduce the search space by skipping over files

`Config Param: ENABLE_DATA_SKIPPING`
`Since Version: 0.10.0` | +| [hoodie.file.index.enable](#hoodiefileindexenable) | true (Optional) | Enables use of the spark file index implementation for Hudi, that speeds up listing of large tables.

`Config Param: ENABLE_HOODIE_FILE_INDEX` | +| [hoodie.schema.on.read.enable](#hoodieschemaonreadenable) | false (Optional) | Enables support for Schema Evolution feature

`Config Param: SCHEMA_EVOLUTION_ENABLED` | --- @@ -106,72 +103,75 @@ Options useful for writing tables via `write.format.option(...)` [**Basic Configs**](#Write-Options-basic-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------------------------------- | --------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql. | | -| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | N/A **(Required)** | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString() | | -| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | N/A **(Required)** | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c` | | -| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false (Optional) | Enable running of clustering service, asynchronously as inserts happen on the table. | 0.7.0 | -| [hoodie.clustering.inline](#hoodieclusteringinline) | false (Optional) | Turn on inline clustering - clustering will be run after each write operation is complete | 0.7.0 | -| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore. | | -| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url | | -| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url | | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | -| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values) | | -| [hoodie.datasource.write.operation](#hoodiedatasourcewriteoperation) | upsert (Optional) | Whether to do upsert, insert or bulk_insert for the write operation. Use bulk_insert to load new data into a table, and there on use upsert/insert. bulk insert uses a disk based write path to scale to load large inputs without need to cache it. | | -| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) 
| | -| [hoodie.datasource.write.streaming.disable.compaction](#hoodiedatasourcewritestreamingdisablecompaction) | false (Optional) | By default for MOR table, async compaction is enabled with spark streaming sink. By setting this config to true, we can disable it and the expectation is that, users will schedule and execute compaction in a different process/job altogether. Some users may wish to run it separately to manage resources across table services and regular ingestion pipeline and so this could be preferred on such cases. | 0.14.0 | -| [hoodie.datasource.write.table.type](#hoodiedatasourcewritetabletype) | COPY_ON_WRITE (Optional) | The table type for the underlying data, for this write. This can’t change between writes. | | -| [hoodie.sql.insert.mode](#hoodiesqlinsertmode) | upsert (Optional) | Insert mode when insert data to pk-table. The optional modes are: upsert, strict and non-strict.For upsert mode, insert statement do the upsert operation for the pk-table which will update the duplicate record.For strict mode, insert statement will keep the primary key uniqueness constraint which do not allow duplicate record.While for non-strict mode, hudi just do the insert operation for the pk-table. | | +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.

`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | N/A **(Required)** | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString()

`Config Param: PARTITIONPATH_FIELD` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | N/A **(Required)** | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`

`Config Param: RECORDKEY_FIELD` | +| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false (Optional) | Enable running of clustering service, asynchronously as inserts happen on the table.

`Config Param: ASYNC_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.clustering.inline](#hoodieclusteringinline) | false (Optional) | Turn on inline clustering - clustering will be run after each write operation is complete

`Config Param: INLINE_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore.

`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url

`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url

`Config Param: METASTORE_URIS` | +| [hoodie.datasource.insert.dup.policy](#hoodiedatasourceinsertduppolicy) | none (Optional) | When operation type is set to "insert", users can optionally enforce a dedup policy. This policy will be employed when records being ingested already exist in storage. Default policy is none and no action will be taken. Another option is to choose "drop", with which matching records from the incoming batch will be dropped and the rest will be ingested. The third option is "fail", which will fail the write operation when the same records are re-ingested. In other words, a given record as deduced by the key generation policy can be ingested only once to the target table of interest.

`Config Param: INSERT_DUP_POLICY` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.

`Config Param: META_SYNC_ENABLED` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)

`Config Param: HIVE_STYLE_PARTITIONING` | +| [hoodie.datasource.write.operation](#hoodiedatasourcewriteoperation) | upsert (Optional) | Whether to do upsert, insert or bulk_insert for the write operation. Use bulk_insert to load new data into a table, and there on use upsert/insert. bulk insert uses a disk based write path to scale to load large inputs without need to cache it.

`Config Param: OPERATION` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)

`Config Param: PRECOMBINE_FIELD` | +| [hoodie.datasource.write.streaming.disable.compaction](#hoodiedatasourcewritestreamingdisablecompaction) | false (Optional) | By default for MOR table, async compaction is enabled with spark streaming sink. By setting this config to true, we can disable it and the expectation is that, users will schedule and execute compaction in a different process/job altogether. Some users may wish to run it separately to manage resources across table services and regular ingestion pipeline and so this could be preferred on such cases.

`Config Param: STREAMING_DISABLE_COMPACTION`
`Since Version: 0.14.0` | +| [hoodie.datasource.write.table.type](#hoodiedatasourcewritetabletype) | COPY_ON_WRITE (Optional) | The table type for the underlying data, for this write. This can’t change between writes.

`Config Param: TABLE_TYPE` | +| [hoodie.sql.insert.mode](#hoodiesqlinsertmode) | upsert (Optional) | Insert mode when inserting data into a pk-table. The optional modes are: upsert, strict and non-strict. For upsert mode, the insert statement does an upsert operation on the pk-table, which will update duplicate records. For strict mode, the insert statement keeps the primary key uniqueness constraint and does not allow duplicate records. For non-strict mode, hudi just does the insert operation on the pk-table. This config is deprecated as of 0.14.0. Please use hoodie.sql.write.operation and hoodie.datasource.insert.dup.policy as you see fit.

`Config Param: SQL_INSERT_MODE` | +| [hoodie.sql.write.operation](#hoodiesqlwriteoperation) | insert (Optional) | Sql write operation to use with INSERT_INTO spark sql command. This comes with 3 possible values: bulk_insert, insert and upsert. bulk_insert is generally meant for initial loads and is known to be performant compared to insert. But bulk_insert may not do small file management. If you prefer hudi to automatically manage small files, then you can go with "insert". There is no precombine (if there are duplicates within the same batch being ingested, the same dups will be ingested) with bulk_insert and insert, and there is no index lookup either. If you use INSERT_INTO for a mutable dataset, then you may have to set this config value to "upsert". With upsert, you get precombine semantics and updates to existing records on storage are also honored. If not, you may see duplicates.

`Config Param: SQL_WRITE_OPERATION` | [**Advanced Configs**](#Write-Options-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------- | -| [hoodie.datasource.hive_sync.serde_properties](#hoodiedatasourcehive_syncserde_properties) | N/A **(Required)** | Serde properties to hive table. | | -| [hoodie.datasource.hive_sync.table_properties](#hoodiedatasourcehive_synctable_properties) | N/A **(Required)** | Additional properties to store with table. | | -| [hoodie.datasource.write.partitions.to.delete](#hoodiedatasourcewritepartitionstodelete) | N/A **(Required)** | Comma separated list of partitions to delete. Allows use of wildcard * | | -| [hoodie.datasource.write.table.name](#hoodiedatasourcewritetablename) | N/A **(Required)** | Table name for the datasource write. Also used to register the table into meta stores. | | -| [hoodie.datasource.compaction.async.enable](#hoodiedatasourcecompactionasyncenable) | true (Optional) | Controls whether async compaction should be turned on for MOR table writing. | | -| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd | | -| [hoodie.datasource.hive_sync.auto_create_database](#hoodiedatasourcehive_syncauto_create_database) | true (Optional) | Auto create hive database if does not exists | | -| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync. | | -| [hoodie.datasource.hive_sync.batch_num](#hoodiedatasourcehive_syncbatch_num) | 1000 (Optional) | The number of partitions one batch when synchronous partitions to hive. | | -| [hoodie.datasource.hive_sync.bucket_sync](#hoodiedatasourcehive_syncbucket_sync) | false (Optional) | Whether sync hive metastore bucket specification when using bucket index.The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS' | | -| [hoodie.datasource.hive_sync.create_managed_table](#hoodiedatasourcehive_synccreate_managed_table) | false (Optional) | Whether to sync the table as managed table. | | -| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to. | | -| [hoodie.datasource.hive_sync.ignore_exceptions](#hoodiedatasourcehive_syncignore_exceptions) | false (Optional) | Ignore exceptions when syncing with Hive. 
| | -| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'. | | -| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns. | | -| [hoodie.datasource.hive_sync.password](#hoodiedatasourcehive_syncpassword) | hive (Optional) | hive password to use | | -| [hoodie.datasource.hive_sync.skip_ro_suffix](#hoodiedatasourcehive_syncskip_ro_suffix) | false (Optional) | Skip the _ro suffix for Read optimized table, when registering | | -| [hoodie.datasource.hive_sync.support_timestamp](#hoodiedatasourcehive_syncsupport_timestamp) | false (Optional) | ‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. Disabled by default for backward compatibility. | | -| [hoodie.datasource.hive_sync.sync_as_datasource](#hoodiedatasourcehive_syncsync_as_datasource) | true (Optional) | | | -| [hoodie.datasource.hive_sync.sync_comment](#hoodiedatasourcehive_syncsync_comment) | false (Optional) | Whether to sync the table column comments while syncing the table. | | -| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to. | | -| [hoodie.datasource.hive_sync.use_jdbc](#hoodiedatasourcehive_syncuse_jdbc) | true (Optional) | Use JDBC when hive synchronization is enabled | | -| [hoodie.datasource.hive_sync.use_pre_apache_input_format](#hoodiedatasourcehive_syncuse_pre_apache_input_format) | false (Optional) | Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. Use this when you are in the process of migrating from com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format | | -| [hoodie.datasource.hive_sync.username](#hoodiedatasourcehive_syncusername) | hive (Optional) | hive user name to use | | -| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change. | | -| [hoodie.datasource.write.commitmeta.key.prefix](#hoodiedatasourcewritecommitmetakeyprefix) | _ (Optional) | Option keys beginning with this prefix, are automatically added to the commit/deltacommit metadata. This is useful to store checkpointing information, in a consistent way with the hudi timeline | | -| [hoodie.datasource.write.drop.partition.columns](#hoodiedatasourcewritedroppartitioncolumns) | false (Optional) | When set to true, will not write the partition columns into hudi. By default, false. | | -| [hoodie.datasource.write.insert.drop.duplicates](#hoodiedatasourcewriteinsertdropduplicates) | false (Optional) | If set to true, records from the incoming dataframe will not overwrite existing records with the same key during the write operation. 
| | -| [hoodie.datasource.write.keygenerator.class](#hoodiedatasourcewritekeygeneratorclass) | org.apache.hudi.keygen.SimpleKeyGenerator (Optional) | Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator` | | -| [hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled](#hoodiedatasourcewritekeygeneratorconsistentlogicaltimestampenabled) | false (Optional) | When set to true, consistent value will be generated for a logical timestamp type column, like timestamp-millis and timestamp-micros, irrespective of whether row-writer is enabled. Disabled by default so as not to break the pipeline that deploy either fully row-writer path or non row-writer path. For example, if it is kept disabled then record key of timestamp type with value `2016-12-29 09:54:00` will be written as timestamp `2016-12-29 09:54:00.0` in row-writer path, while it will be written as long value `1483023240000000` in non row-writer path. If enabled, then the timestamp value will be written in both the cases. | | -| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false (Optional) | Should we url encode the partition path value, before creating the folder structure. | | -| [hoodie.datasource.write.payload.class](#hoodiedatasourcewritepayloadclass) | org.apache.hudi.common.model.OverwriteWithLatestAvroPayload (Optional) | Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. This will render any value set for PRECOMBINE_FIELD_OPT_VAL in-effective | | -| [hoodie.datasource.write.reconcile.schema](#hoodiedatasourcewritereconcileschema) | false (Optional) | This config controls how writer's schema will be selected based on the incoming batch's schema as well as existing table's one. When schema reconciliation is DISABLED, incoming batch's schema will be picked as a writer-schema (therefore updating table's schema). When schema reconciliation is ENABLED, writer-schema will be picked such that table's schema (after txn) is either kept the same or extended, meaning that we'll always prefer the schema that either adds new columns or stays the same. This enables us, to always extend the table's schema during evolution and never lose the data (when, for ex, existing column is being dropped in a new batch) | | -| [hoodie.datasource.write.record.merger.impls](#hoodiedatasourcewriterecordmergerimpls) | org.apache.hudi.common.model.HoodieAvroRecordMerger (Optional) | List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. These merger impls will filter by hoodie.datasource.write.record.merger.strategy Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc) | 0.13.0 | -| [hoodie.datasource.write.record.merger.strategy](#hoodiedatasourcewriterecordmergerstrategy) | eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 (Optional) | Id of merger strategy. Hudi will pick HoodieRecordMerger implementations in hoodie.datasource.write.record.merger.impls which has the same merger strategy id | 0.13.0 | -| [hoodie.datasource.write.row.writer.enable](#hoodiedatasourcewriterowwriterenable) | true (Optional) | When set to true, will perform write operations directly using the spark native `Row` representation, avoiding any additional conversion costs. 
| | -| [hoodie.datasource.write.streaming.checkpoint.identifier](#hoodiedatasourcewritestreamingcheckpointidentifier) | default_single_writer (Optional) | A stream identifier used for HUDI to fetch the right checkpoint(`batch id` to be more specific) corresponding this writer. Please note that keep the identifier an unique value for different writer if under multi-writer scenario. If the value is not set, will only keep the checkpoint info in the memory. This could introduce the potential issue that the job is restart(`batch id` is lost) while spark checkpoint write fails, causing spark will retry and rewrite the data. | 0.13.0 | -| [hoodie.datasource.write.streaming.ignore.failed.batch](#hoodiedatasourcewritestreamingignorefailedbatch) | false (Optional) | Config to indicate whether to ignore any non exception error (e.g. writestatus error) within a streaming microbatch. Turning this on, could hide the write status errors while the spark checkpoint moves ahead.So, would recommend users to use this with caution. | | -| [hoodie.datasource.write.streaming.retry.count](#hoodiedatasourcewritestreamingretrycount) | 3 (Optional) | Config to indicate how many times streaming job should retry for a failed micro batch. | | -| [hoodie.datasource.write.streaming.retry.interval.ms](#hoodiedatasourcewritestreamingretryintervalms) | 2000 (Optional) | Config to indicate how long (by millisecond) before a retry should issued for failed microbatch | | -| [hoodie.deltastreamer.source.kafka.value.deserializer.class](#hoodiedeltastreamersourcekafkavaluedeserializerclass) | io.confluent.kafka.serializers.KafkaAvroDeserializer (Optional) | This class is used by kafka client to deserialize the records | 0.9.0 | -| [hoodie.meta.sync.client.tool.class](#hoodiemetasyncclienttoolclass) | org.apache.hudi.hive.HiveSyncTool (Optional) | Sync tool class name used to sync to metastore. Defaults to Hive. | | -| [hoodie.sql.bulk.insert.enable](#hoodiesqlbulkinsertenable) | false (Optional) | When set to true, the sql insert statement will use bulk insert. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.serde_properties](#hoodiedatasourcehive_syncserde_properties) | N/A **(Required)** | Serde properties to hive table.

`Config Param: HIVE_TABLE_SERDE_PROPERTIES` | +| [hoodie.datasource.hive_sync.table_properties](#hoodiedatasourcehive_synctable_properties) | N/A **(Required)** | Additional properties to store with table.

`Config Param: HIVE_TABLE_PROPERTIES` | +| [hoodie.datasource.write.partitions.to.delete](#hoodiedatasourcewritepartitionstodelete) | N/A **(Required)** | Comma separated list of partitions to delete. Allows use of wildcard *

`Config Param: PARTITIONS_TO_DELETE` | +| [hoodie.datasource.write.table.name](#hoodiedatasourcewritetablename) | N/A **(Required)** | Table name for the datasource write. Also used to register the table into meta stores.

`Config Param: TABLE_NAME` | +| [hoodie.datasource.compaction.async.enable](#hoodiedatasourcecompactionasyncenable) | true (Optional) | Controls whether async compaction should be turned on for MOR table writing.

`Config Param: ASYNC_COMPACT_ENABLE` | +| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd

`Config Param: HIVE_ASSUME_DATE_PARTITION` | +| [hoodie.datasource.hive_sync.auto_create_database](#hoodiedatasourcehive_syncauto_create_database) | true (Optional) | Auto create hive database if it does not exist

`Config Param: HIVE_AUTO_CREATE_DATABASE` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync.

`Config Param: HIVE_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.batch_num](#hoodiedatasourcehive_syncbatch_num) | 1000 (Optional) | The number of partitions per batch when synchronizing partitions to hive.

`Config Param: HIVE_BATCH_SYNC_PARTITION_NUM` | +| [hoodie.datasource.hive_sync.bucket_sync](#hoodiedatasourcehive_syncbucket_sync) | false (Optional) | Whether to sync the hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'

`Config Param: HIVE_SYNC_BUCKET_SYNC` | +| [hoodie.datasource.hive_sync.create_managed_table](#hoodiedatasourcehive_synccreate_managed_table) | false (Optional) | Whether to sync the table as managed table.

`Config Param: HIVE_CREATE_MANAGED_TABLE` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to.

`Config Param: HIVE_DATABASE` | +| [hoodie.datasource.hive_sync.ignore_exceptions](#hoodiedatasourcehive_syncignore_exceptions) | false (Optional) | Ignore exceptions when syncing with Hive.

`Config Param: HIVE_IGNORE_EXCEPTIONS` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.

`Config Param: HIVE_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns.

`Config Param: HIVE_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.password](#hoodiedatasourcehive_syncpassword) | hive (Optional) | hive password to use

`Config Param: HIVE_PASS` | +| [hoodie.datasource.hive_sync.skip_ro_suffix](#hoodiedatasourcehive_syncskip_ro_suffix) | false (Optional) | Skip the _ro suffix for Read optimized table, when registering

`Config Param: HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE` | +| [hoodie.datasource.hive_sync.support_timestamp](#hoodiedatasourcehive_syncsupport_timestamp) | false (Optional) | ‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. Disabled by default for backward compatibility.

`Config Param: HIVE_SUPPORT_TIMESTAMP_TYPE` | +| [hoodie.datasource.hive_sync.sync_as_datasource](#hoodiedatasourcehive_syncsync_as_datasource) | true (Optional) |

`Config Param: HIVE_SYNC_AS_DATA_SOURCE_TABLE` | +| [hoodie.datasource.hive_sync.sync_comment](#hoodiedatasourcehive_syncsync_comment) | false (Optional) | Whether to sync the table column comments while syncing the table.

`Config Param: HIVE_SYNC_COMMENT` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to.

`Config Param: HIVE_TABLE` | +| [hoodie.datasource.hive_sync.use_jdbc](#hoodiedatasourcehive_syncuse_jdbc) | true (Optional) | Use JDBC when hive synchronization is enabled

`Config Param: HIVE_USE_JDBC` | +| [hoodie.datasource.hive_sync.use_pre_apache_input_format](#hoodiedatasourcehive_syncuse_pre_apache_input_format) | false (Optional) | Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. Use this when you are in the process of migrating from com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format

`Config Param: HIVE_USE_PRE_APACHE_INPUT_FORMAT` | +| [hoodie.datasource.hive_sync.username](#hoodiedatasourcehive_syncusername) | hive (Optional) | hive user name to use

`Config Param: HIVE_USER` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change.

`Config Param: HIVE_CONDITIONAL_SYNC` | +| [hoodie.datasource.write.commitmeta.key.prefix](#hoodiedatasourcewritecommitmetakeyprefix) | _ (Optional) | Option keys beginning with this prefix, are automatically added to the commit/deltacommit metadata. This is useful to store checkpointing information, in a consistent way with the hudi timeline

`Config Param: COMMIT_METADATA_KEYPREFIX` | +| [hoodie.datasource.write.drop.partition.columns](#hoodiedatasourcewritedroppartitioncolumns) | false (Optional) | When set to true, will not write the partition columns into hudi. By default, false.

`Config Param: DROP_PARTITION_COLUMNS` | +| [hoodie.datasource.write.insert.drop.duplicates](#hoodiedatasourcewriteinsertdropduplicates) | false (Optional) | If set to true, records from the incoming dataframe will not overwrite existing records with the same key during the write operation. This config is deprecated as of 0.14.0. Please use hoodie.datasource.insert.dup.policy instead.

`Config Param: INSERT_DROP_DUPS` | +| [hoodie.datasource.write.keygenerator.class](#hoodiedatasourcewritekeygeneratorclass) | org.apache.hudi.keygen.SimpleKeyGenerator (Optional) | Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator`

`Config Param: KEYGENERATOR_CLASS_NAME` | +| [hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled](#hoodiedatasourcewritekeygeneratorconsistentlogicaltimestampenabled) | false (Optional) | When set to true, consistent value will be generated for a logical timestamp type column, like timestamp-millis and timestamp-micros, irrespective of whether row-writer is enabled. Disabled by default so as not to break the pipeline that deploy either fully row-writer path or non row-writer path. For example, if it is kept disabled then record key of timestamp type with value `2016-12-29 09:54:00` will be written as timestamp `2016-12-29 09:54:00.0` in row-writer path, while it will be written as long value `1483023240000000` in non row-writer path. If enabled, then the timestamp value will be written in both the cases.

`Config Param: KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED` | +| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false (Optional) | Should we url encode the partition path value, before creating the folder structure.

`Config Param: URL_ENCODE_PARTITIONING` | +| [hoodie.datasource.write.payload.class](#hoodiedatasourcewritepayloadclass) | org.apache.hudi.common.model.OverwriteWithLatestAvroPayload (Optional) | Payload class used. Override this if you want to roll your own merge logic when upserting/inserting. This will render any value set for PRECOMBINE_FIELD_OPT_VAL ineffective

`Config Param: PAYLOAD_CLASS_NAME` | +| [hoodie.datasource.write.reconcile.schema](#hoodiedatasourcewritereconcileschema) | false (Optional) | This config controls how writer's schema will be selected based on the incoming batch's schema as well as existing table's one. When schema reconciliation is DISABLED, incoming batch's schema will be picked as a writer-schema (therefore updating table's schema). When schema reconciliation is ENABLED, writer-schema will be picked such that table's schema (after txn) is either kept the same or extended, meaning that we'll always prefer the schema that either adds new columns or stays the same. This enables us, to always extend the table's schema during evolution and never lose the data (when, for ex, existing column is being dropped in a new batch)

`Config Param: RECONCILE_SCHEMA` | +| [hoodie.datasource.write.record.merger.impls](#hoodiedatasourcewriterecordmergerimpls) | org.apache.hudi.common.model.HoodieAvroRecordMerger (Optional) | List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. These merger impls will filter by hoodie.datasource.write.record.merger.strategy Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc)

`Config Param: RECORD_MERGER_IMPLS`
`Since Version: 0.13.0` | +| [hoodie.datasource.write.record.merger.strategy](#hoodiedatasourcewriterecordmergerstrategy) | eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 (Optional) | Id of merger strategy. Hudi will pick HoodieRecordMerger implementations in hoodie.datasource.write.record.merger.impls which has the same merger strategy id

`Config Param: RECORD_MERGER_STRATEGY`
`Since Version: 0.13.0` | +| [hoodie.datasource.write.row.writer.enable](#hoodiedatasourcewriterowwriterenable) | true (Optional) | When set to true, will perform write operations directly using the spark native `Row` representation, avoiding any additional conversion costs.

`Config Param: ENABLE_ROW_WRITER` | +| [hoodie.datasource.write.streaming.checkpoint.identifier](#hoodiedatasourcewritestreamingcheckpointidentifier) | default_single_writer (Optional) | A stream identifier used by HUDI to fetch the right checkpoint (`batch id` to be more specific) corresponding to this writer. Please keep the identifier unique for each writer in a multi-writer scenario. If the value is not set, the checkpoint info is only kept in memory. This could introduce the potential issue that the job is restarted (`batch id` is lost) while the spark checkpoint write fails, causing spark to retry and rewrite the data.

`Config Param: STREAMING_CHECKPOINT_IDENTIFIER`
`Since Version: 0.13.0` | +| [hoodie.datasource.write.streaming.ignore.failed.batch](#hoodiedatasourcewritestreamingignorefailedbatch) | false (Optional) | Config to indicate whether to ignore any non exception error (e.g. writestatus error) within a streaming microbatch. Turning this on could hide write status errors while the spark checkpoint moves ahead, so users are recommended to use this with caution.

`Config Param: STREAMING_IGNORE_FAILED_BATCH` | +| [hoodie.datasource.write.streaming.retry.count](#hoodiedatasourcewritestreamingretrycount) | 3 (Optional) | Config to indicate how many times streaming job should retry for a failed micro batch.

`Config Param: STREAMING_RETRY_CNT` | +| [hoodie.datasource.write.streaming.retry.interval.ms](#hoodiedatasourcewritestreamingretryintervalms) | 2000 (Optional) | Config to indicate how long (in milliseconds) before a retry should be issued for a failed microbatch

`Config Param: STREAMING_RETRY_INTERVAL_MS` | +| [hoodie.deltastreamer.source.kafka.value.deserializer.class](#hoodiedeltastreamersourcekafkavaluedeserializerclass) | io.confluent.kafka.serializers.KafkaAvroDeserializer (Optional) | This class is used by kafka client to deserialize the records

`Config Param: KAFKA_AVRO_VALUE_DESERIALIZER_CLASS`
`Since Version: 0.9.0` | +| [hoodie.meta.sync.client.tool.class](#hoodiemetasyncclienttoolclass) | org.apache.hudi.hive.HiveSyncTool (Optional) | Sync tool class name used to sync to metastore. Defaults to Hive.

`Config Param: META_SYNC_CLIENT_TOOL_CLASS_NAME` | +| [hoodie.spark.sql.optimized.writes.enable](#hoodiesparksqloptimizedwritesenable) | true (Optional) | Controls whether spark sql prepped update, delete, and merge are enabled.

`Config Param: SPARK_SQL_OPTIMIZED_WRITES`
`Since Version: 0.14.0` | +| [hoodie.sql.bulk.insert.enable](#hoodiesqlbulkinsertenable) | false (Optional) | When set to true, the sql insert statement will use bulk insert. This config is deprecated as of 0.14.0. Please use hoodie.sql.write.operation instead.

`Config Param: SQL_ENABLE_BULK_INSERT` | --- @@ -183,12 +183,12 @@ The following set of configurations help validate new data before commits. [**Advanced Configs**](#PreCommit-Validator-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------- | ----------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.precommit.validators](#hoodieprecommitvalidators) | (Optional) | Comma separated list of class names that can be invoked to validate commit | | -| [hoodie.precommit.validators.equality.sql.queries](#hoodieprecommitvalidatorsequalitysqlqueries) | (Optional) | Spark SQL queries to run on table before committing new data to validate state before and after commit. Multiple queries separated by ';' delimiter are supported. Example: "select count(*) from \<TABLE_NAME\> Note \<TABLE_NAME\> is replaced by table state before and after commit. | | -| [hoodie.precommit.validators.inequality.sql.queries](#hoodieprecommitvalidatorsinequalitysqlqueries) | (Optional) | Spark SQL queries to run on table before committing new data to validate state before and after commit.Multiple queries separated by ';' delimiter are supported.Example query: 'select count(*) from \<TABLE_NAME\> where col=null'Note \<TABLE_NAME\> variable is expected to be present in query. | | -| [hoodie.precommit.validators.single.value.sql.queries](#hoodieprecommitvalidatorssinglevaluesqlqueries) | (Optional) | Spark SQL queries to run on table before committing new data to validate state after commit.Multiple queries separated by ';' delimiter are supported.Expected result is included as part of query separated by '#'. Example query: 'query1#result1:query2#result2'Note \<TABLE_NAME\> variable is expected to be present in query. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------- | ----------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.precommit.validators](#hoodieprecommitvalidators) | (Optional) | Comma separated list of class names that can be invoked to validate commit

`Config Param: VALIDATOR_CLASS_NAMES` | +| [hoodie.precommit.validators.equality.sql.queries](#hoodieprecommitvalidatorsequalitysqlqueries) | (Optional) | Spark SQL queries to run on table before committing new data to validate state before and after commit. Multiple queries separated by ';' delimiter are supported. Example: "select count(*) from \<TABLE_NAME\>". Note: \<TABLE_NAME\> is replaced by table state before and after commit.

`Config Param: EQUALITY_SQL_QUERIES` | +| [hoodie.precommit.validators.inequality.sql.queries](#hoodieprecommitvalidatorsinequalitysqlqueries) | (Optional) | Spark SQL queries to run on table before committing new data to validate state before and after commit. Multiple queries separated by ';' delimiter are supported. Example query: 'select count(*) from \<TABLE_NAME\> where col=null'. Note: the \<TABLE_NAME\> variable is expected to be present in the query.

`Config Param: INEQUALITY_SQL_QUERIES` | +| [hoodie.precommit.validators.single.value.sql.queries](#hoodieprecommitvalidatorssinglevaluesqlqueries) | (Optional) | Spark SQL queries to run on table before committing new data to validate state after commit. Multiple queries separated by ';' delimiter are supported. The expected result is included as part of the query, separated by '#'. Example query: 'query1#result1:query2#result2'. Note: the \<TABLE_NAME\> variable is expected to be present in the query.

`Config Param: SINGLE_VALUE_SQL_QUERIES` | --- ## Flink Sql Configs {#FLINK_SQL} @@ -203,133 +203,133 @@ Flink jobs using the SQL can be configured through the options in WITH clause. T [**Basic Configs**](#Flink-Options-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------ | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.database.name](#hoodiedatabasename) | N/A **(Required)** | Database name to register to Hive metastore | | -| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name to register to Hive metastore | | -| [path](#path) | N/A **(Required)** | Base path for the target hoodie table. The path would be created if it does not exist, otherwise a Hoodie table expects to be initialized successfully | | -| [read.end-commit](#readend-commit) | N/A **(Required)** | End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss' | | -| [read.start-commit](#readstart-commit) | N/A **(Required)** | Start commit instant for reading, the commit time format should be 'yyyyMMddHHmmss', by default reading from the latest instant for streaming read | | -| [archive.max_commits](#archivemax_commits) | 50 (Optional) | Max number of commits to keep before archiving older commits into a sequential log, default 50 | | -| [archive.min_commits](#archivemin_commits) | 40 (Optional) | Min number of commits to keep before archiving older commits into a sequential log, default 40 | | -| [cdc.enabled](#cdcenabled) | false (Optional) | When enable, persist the change data if necessary, and can be queried as a CDC query mode | | -| [cdc.supplemental.logging.mode](#cdcsupplementalloggingmode) | DATA_BEFORE_AFTER (Optional) | Setting 'op_key_only' persists the 'op' and the record key only, setting 'data_before' persists the additional 'before' image, and setting 'data_before_after' persists the additional 'before' and 'after' images. | | -| [changelog.enabled](#changelogenabled) | false (Optional) | Whether to keep all the intermediate changes, we try to keep all the changes of a record when enabled: 1). The sink accept the UPDATE_BEFORE message; 2). The source try to emit every changes of a record. The semantics is best effort because the compaction job would finally merge all changes of a record into one. default false to have UPSERT semantics | | -| [clean.async.enabled](#cleanasyncenabled) | true (Optional) | Whether to cleanup the old commits immediately on new commits, enabled by default | | -| [clean.retain_commits](#cleanretain_commits) | 30 (Optional) | Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled). 
This also directly translates into how much you can incrementally pull on this table, default 30 | | -| [clustering.async.enabled](#clusteringasyncenabled) | false (Optional) | Async Clustering, default false | | -| [clustering.plan.strategy.small.file.limit](#clusteringplanstrategysmallfilelimit) | 600 (Optional) | Files smaller than the size specified here are candidates for clustering, default 600 MB | | -| [clustering.plan.strategy.target.file.max.bytes](#clusteringplanstrategytargetfilemaxbytes) | 1073741824 (Optional) | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups, default 1 GB | | -| [compaction.async.enabled](#compactionasyncenabled) | true (Optional) | Async Compaction, enabled by default for MOR | | -| [compaction.delta_commits](#compactiondelta_commits) | 5 (Optional) | Max delta commits needed to trigger compaction, default 5 commits | | -| [hive_sync.enabled](#hive_syncenabled) | false (Optional) | Asynchronously sync Hive meta to HMS, default false | | -| [hive_sync.jdbc_url](#hive_syncjdbc_url) | jdbc:hive2://localhost:10000 (Optional) | Jdbc URL for hive sync, default 'jdbc:hive2://localhost:10000' | | -| [hive_sync.metastore.uris](#hive_syncmetastoreuris) | (Optional) | Metastore uris for hive sync, default '' | | -| [hive_sync.mode](#hive_syncmode) | HMS (Optional) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql, default 'hms' | | -| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot (Optional) | Decides how data files need to be read, in 1) Snapshot mode (obtain latest view, based on row & columnar data); 2) incremental mode (new data since an instantTime); 3) Read Optimized mode (obtain latest view, based on columnar data) .Default: snapshot | | -| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values) | | -| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | (Optional) | Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`. Actual value obtained by invoking .toString(), default '' | | -| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | uuid (Optional) | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c` | | -| [index.type](#indextype) | FLINK_STATE (Optional) | Index type of Flink write job, default is using state backed index. | | -| [metadata.compaction.delta_commits](#metadatacompactiondelta_commits) | 10 (Optional) | Max delta commits for metadata table to trigger compaction, default 10 | | -| [metadata.enabled](#metadataenabled) | true (Optional) | Enable the internal metadata table which serves table metadata like level file listings, default enabled | | -| [precombine.field](#precombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) 
| | -| [read.streaming.enabled](#readstreamingenabled) | false (Optional) | Whether to read as streaming source, default false | | -| [table.type](#tabletype) | COPY_ON_WRITE (Optional) | Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ | | -| [write.operation](#writeoperation) | upsert (Optional) | The write operation, that this write should do | | -| [write.parquet.max.file.size](#writeparquetmaxfilesize) | 120 (Optional) | Target size for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------ | --------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.database.name](#hoodiedatabasename) | N/A **(Required)** | Database name to register to Hive metastore

`Config Param: DATABASE_NAME` | +| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name to register to Hive metastore

`Config Param: TABLE_NAME` | +| [path](#path) | N/A **(Required)** | Base path for the target hoodie table. The path would be created if it does not exist, otherwise a Hoodie table expects to be initialized successfully

`Config Param: PATH` | +| [read.end-commit](#readend-commit) | N/A **(Required)** | End commit instant for reading, the commit time format should be 'yyyyMMddHHmmss'

`Config Param: READ_END_COMMIT` | +| [read.start-commit](#readstart-commit) | N/A **(Required)** | Start commit instant for reading, the commit time format should be 'yyyyMMddHHmmss', by default reading from the latest instant for streaming read

`Config Param: READ_START_COMMIT` | +| [archive.max_commits](#archivemax_commits) | 50 (Optional) | Max number of commits to keep before archiving older commits into a sequential log, default 50

`Config Param: ARCHIVE_MAX_COMMITS` | +| [archive.min_commits](#archivemin_commits) | 40 (Optional) | Min number of commits to keep before archiving older commits into a sequential log, default 40

`Config Param: ARCHIVE_MIN_COMMITS` | +| [cdc.enabled](#cdcenabled) | false (Optional) | When enabled, persist the change data if necessary, and it can be queried in CDC query mode

`Config Param: CDC_ENABLED` | +| [cdc.supplemental.logging.mode](#cdcsupplementalloggingmode) | DATA_BEFORE_AFTER (Optional) | Setting 'op_key_only' persists the 'op' and the record key only, setting 'data_before' persists the additional 'before' image, and setting 'data_before_after' persists the additional 'before' and 'after' images.

`Config Param: SUPPLEMENTAL_LOGGING_MODE` | +| [changelog.enabled](#changelogenabled) | false (Optional) | Whether to keep all the intermediate changes. We try to keep all the changes of a record when enabled: 1). The sink accepts the UPDATE_BEFORE message; 2). The source tries to emit every change of a record. The semantics are best effort because the compaction job would finally merge all changes of a record into one. Default false to have UPSERT semantics

`Config Param: CHANGELOG_ENABLED` | +| [clean.async.enabled](#cleanasyncenabled) | true (Optional) | Whether to cleanup the old commits immediately on new commits, enabled by default

`Config Param: CLEAN_ASYNC_ENABLED` | +| [clean.retain_commits](#cleanretain_commits) | 30 (Optional) | Number of commits to retain. So data will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much you can incrementally pull on this table, default 30

`Config Param: CLEAN_RETAIN_COMMITS` | +| [clustering.async.enabled](#clusteringasyncenabled) | false (Optional) | Async Clustering, default false

`Config Param: CLUSTERING_ASYNC_ENABLED` | +| [clustering.plan.strategy.small.file.limit](#clusteringplanstrategysmallfilelimit) | 600 (Optional) | Files smaller than the size specified here are candidates for clustering, default 600 MB

`Config Param: CLUSTERING_PLAN_STRATEGY_SMALL_FILE_LIMIT` | +| [clustering.plan.strategy.target.file.max.bytes](#clusteringplanstrategytargetfilemaxbytes) | 1073741824 (Optional) | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups, default 1 GB

`Config Param: CLUSTERING_PLAN_STRATEGY_TARGET_FILE_MAX_BYTES` | +| [compaction.async.enabled](#compactionasyncenabled) | true (Optional) | Async Compaction, enabled by default for MOR

`Config Param: COMPACTION_ASYNC_ENABLED` | +| [compaction.delta_commits](#compactiondelta_commits) | 5 (Optional) | Max delta commits needed to trigger compaction, default 5 commits

`Config Param: COMPACTION_DELTA_COMMITS` | +| [hive_sync.enabled](#hive_syncenabled) | false (Optional) | Asynchronously sync Hive meta to HMS, default false

`Config Param: HIVE_SYNC_ENABLED` | +| [hive_sync.jdbc_url](#hive_syncjdbc_url) | jdbc:hive2://localhost:10000 (Optional) | Jdbc URL for hive sync, default 'jdbc:hive2://localhost:10000'

`Config Param: HIVE_SYNC_JDBC_URL` | +| [hive_sync.metastore.uris](#hive_syncmetastoreuris) | (Optional) | Metastore uris for hive sync, default ''

`Config Param: HIVE_SYNC_METASTORE_URIS` | +| [hive_sync.mode](#hive_syncmode) | HMS (Optional) | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql, default 'hms'

`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.query.type](#hoodiedatasourcequerytype) | snapshot (Optional) | Decides how data files need to be read, in 1) Snapshot mode (obtain latest view, based on row & columnar data); 2) incremental mode (new data since an instantTime); 3) Read Optimized mode (obtain latest view, based on columnar data). Default: snapshot

`Config Param: QUERY_TYPE` | +| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values)

`Config Param: HIVE_STYLE_PARTITIONING` | +| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | (Optional) | Partition path field. Value to be used at the `partitionPath` component of `HoodieKey`. Actual value obtained by invoking .toString(), default ''

`Config Param: PARTITION_PATH_FIELD` | +| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | uuid (Optional) | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`

`Config Param: RECORD_KEY_FIELD` | +| [index.type](#indextype) | FLINK_STATE (Optional) | Index type of Flink write job, default is using state backed index.

`Config Param: INDEX_TYPE` | +| [metadata.compaction.delta_commits](#metadatacompactiondelta_commits) | 10 (Optional) | Max delta commits for metadata table to trigger compaction, default 10

`Config Param: METADATA_COMPACTION_DELTA_COMMITS` | +| [metadata.enabled](#metadataenabled) | true (Optional) | Enable the internal metadata table which serves table metadata like level file listings, default enabled

`Config Param: METADATA_ENABLED` | +| [precombine.field](#precombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)

`Config Param: PRECOMBINE_FIELD` | +| [read.streaming.enabled](#readstreamingenabled) | false (Optional) | Whether to read as streaming source, default false

`Config Param: READ_AS_STREAMING` | +| [table.type](#tabletype) | COPY_ON_WRITE (Optional) | Type of table to write. COPY_ON_WRITE (or) MERGE_ON_READ

`Config Param: TABLE_TYPE` | +| [write.operation](#writeoperation) | upsert (Optional) | The write operation, that this write should do

`Config Param: OPERATION` | +| [write.parquet.max.file.size](#writeparquetmaxfilesize) | 120 (Optional) | Target size for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.

`Config Param: WRITE_PARQUET_MAX_FILE_SIZE` | [**Advanced Configs**](#Flink-Options-advanced-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [clustering.tasks](#clusteringtasks) | N/A **(Required)** | Parallelism of tasks that do actual clustering, default same as the write task parallelism | | -| [compaction.tasks](#compactiontasks) | N/A **(Required)** | Parallelism of tasks that do actual compaction, default same as the write task parallelism | | -| [hive_sync.conf.dir](#hive_syncconfdir) | N/A **(Required)** | The hive configuration directory, where the hive-site.xml lies in, the file should be put on the client machine | | -| [hive_sync.serde_properties](#hive_syncserde_properties) | N/A **(Required)** | Serde properties to hive table, the data format is k1=v1 k2=v2 | | -| [hive_sync.table_properties](#hive_synctable_properties) | N/A **(Required)** | Additional properties to store with table, the data format is k1=v1 k2=v2 | | -| [hoodie.datasource.write.keygenerator.class](#hoodiedatasourcewritekeygeneratorclass) | N/A **(Required)** | Key generator class, that implements will extract the key out of incoming record | | -| [read.tasks](#readtasks) | N/A **(Required)** | Parallelism of tasks that do actual read, default is the parallelism of the execution environment | | -| [source.avro-schema](#sourceavro-schema) | N/A **(Required)** | Source avro schema string, the parsed schema is used for deserialization | | -| [source.avro-schema.path](#sourceavro-schemapath) | N/A **(Required)** | Source avro schema file path, the parsed schema is used for deserialization | | -| [write.bucket_assign.tasks](#writebucket_assigntasks) | N/A **(Required)** | Parallelism of tasks that do bucket assign, default same as the write task parallelism | | -| [write.index_bootstrap.tasks](#writeindex_bootstraptasks) | N/A **(Required)** | Parallelism of tasks that do index bootstrap, default same as the write task parallelism | | -| [write.partition.format](#writepartitionformat) | N/A **(Required)** | Partition path format, only valid when 'write.datetime.partitioning' is true, default is: 1) 'yyyyMMddHH' for timestamp(3) WITHOUT TIME ZONE, LONG, FLOAT, DOUBLE, DECIMAL; 2) 'yyyyMMdd' for DATE and INT. 
| | -| [write.tasks](#writetasks) | N/A **(Required)** | Parallelism of tasks that do actual write, default is the parallelism of the execution environment | | -| [clean.policy](#cleanpolicy) | KEEP_LATEST_COMMITS (Optional) | Clean policy to manage the Hudi table. Available option: KEEP_LATEST_COMMITS, KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_BY_HOURS.Default is KEEP_LATEST_COMMITS. | | -| [clean.retain_file_versions](#cleanretain_file_versions) | 5 (Optional) | Number of file versions to retain. default 5 | | -| [clean.retain_hours](#cleanretain_hours) | 24 (Optional) | Number of hours for which commits need to be retained. This config provides a more flexible option ascompared to number of commits retained for cleaning service. Setting this property ensures all the files, but the latest in a file group, corresponding to commits with commit times older than the configured number of hours to be retained are cleaned. | | -| [clustering.delta_commits](#clusteringdelta_commits) | 4 (Optional) | Max delta commits needed to trigger clustering, default 4 commits | | -| [clustering.plan.partition.filter.mode](#clusteringplanpartitionfiltermode) | NONE (Optional) | Partition filter mode used in the creation of clustering plan. Available values are - NONE: do not filter table partition and thus the clustering plan will include all partitions that have clustering candidate.RECENT_DAYS: keep a continuous range of partitions, worked together with configs 'clustering.plan.strategy.daybased.lookback.partitions' and 'clustering.plan.strategy.daybased.skipfromlatest.partitions.SELECTED_PARTITIONS: keep partitions that are in the specified range ['clustering.plan.strategy.cluster.begin.partition', 'clustering.plan.strategy.cluster.end.partition'].DAY_ROLLING: clustering partitions on a rolling basis by the hour to avoid clustering all partitions each time, which strategy sorts the partitions asc and chooses the partition of which index is divided by 24 and the remainder is equal to the current hour. | | -| [clustering.plan.strategy.class](#clusteringplanstrategyclass) | org.apache.hudi.client.clustering.plan.strategy.FlinkSizeBasedClusteringPlanStrategy (Optional) | Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan i.e select what file groups are being clustered. Default strategy, looks at the last N (determined by clustering.plan.strategy.daybased.lookback.partitions) day based partitions picks the small file slices within those partitions. | | -| [clustering.plan.strategy.cluster.begin.partition](#clusteringplanstrategyclusterbeginpartition) | (Optional) | Begin partition used to filter partition (inclusive) | | -| [clustering.plan.strategy.cluster.end.partition](#clusteringplanstrategyclusterendpartition) | (Optional) | End partition used to filter partition (inclusive) | | -| [clustering.plan.strategy.daybased.lookback.partitions](#clusteringplanstrategydaybasedlookbackpartitions) | 2 (Optional) | Number of partitions to list to create ClusteringPlan, default is 2 | | -| [clustering.plan.strategy.daybased.skipfromlatest.partitions](#clusteringplanstrategydaybasedskipfromlatestpartitions) | 0 (Optional) | Number of partitions to skip from latest when choosing partitions to create ClusteringPlan | | -| [clustering.plan.strategy.max.num.groups](#clusteringplanstrategymaxnumgroups) | 30 (Optional) | Maximum number of groups to create as part of ClusteringPlan. 
Increasing groups will increase parallelism, default is 30 | | -| [clustering.plan.strategy.partition.regex.pattern](#clusteringplanstrategypartitionregexpattern) | (Optional) | Filter clustering partitions that matched regex pattern | | -| [clustering.plan.strategy.partition.selected](#clusteringplanstrategypartitionselected) | (Optional) | Partitions to run clustering | | -| [clustering.plan.strategy.sort.columns](#clusteringplanstrategysortcolumns) | (Optional) | Columns to sort the data by when clustering | | -| [clustering.schedule.enabled](#clusteringscheduleenabled) | false (Optional) | Schedule the cluster plan, default false | | -| [compaction.delta_seconds](#compactiondelta_seconds) | 3600 (Optional) | Max delta seconds time needed to trigger compaction, default 1 hour | | -| [compaction.max_memory](#compactionmax_memory) | 100 (Optional) | Max memory in MB for compaction spillable map, default 100MB | | -| [compaction.schedule.enabled](#compactionscheduleenabled) | true (Optional) | Schedule the compaction plan, enabled by default for MOR | | -| [compaction.target_io](#compactiontarget_io) | 512000 (Optional) | Target IO in MB for per compaction (both read and write), default 500 GB | | -| [compaction.timeout.seconds](#compactiontimeoutseconds) | 1200 (Optional) | Max timeout time in seconds for online compaction to rollback, default 20 minutes | | -| [compaction.trigger.strategy](#compactiontriggerstrategy) | num_commits (Optional) | Strategy to trigger compaction, options are 'num_commits': trigger compaction when reach N delta commits; 'time_elapsed': trigger compaction when time elapsed > N seconds since last compaction; 'num_and_time': trigger compaction when both NUM_COMMITS and TIME_ELAPSED are satisfied; 'num_or_time': trigger compaction when NUM_COMMITS or TIME_ELAPSED is satisfied. Default is 'num_commits' | | -| [hive_sync.assume_date_partitioning](#hive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/mm/dd, default false | | -| [hive_sync.auto_create_db](#hive_syncauto_create_db) | true (Optional) | Auto create hive database if it does not exists, default true | | -| [hive_sync.db](#hive_syncdb) | default (Optional) | Database name for hive sync, default 'default' | | -| [hive_sync.file_format](#hive_syncfile_format) | PARQUET (Optional) | File format for hive sync, default 'PARQUET' | | -| [hive_sync.ignore_exceptions](#hive_syncignore_exceptions) | false (Optional) | Ignore exceptions during hive synchronization, default false | | -| [hive_sync.partition_extractor_class](#hive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Tool to extract the partition value from HDFS path, default 'MultiPartKeysValueExtractor' | | -| [hive_sync.partition_fields](#hive_syncpartition_fields) | (Optional) | Partition fields for hive sync, default '' | | -| [hive_sync.password](#hive_syncpassword) | hive (Optional) | Password for hive sync, default 'hive' | | -| [hive_sync.skip_ro_suffix](#hive_syncskip_ro_suffix) | false (Optional) | Skip the _ro suffix for Read optimized table when registering, default false | | -| [hive_sync.support_timestamp](#hive_syncsupport_timestamp) | true (Optional) | INT64 with original type TIMESTAMP_MICROS is converted to hive timestamp type. Disabled by default for backward compatibility. 
| | -| [hive_sync.table](#hive_synctable) | unknown (Optional) | Table name for hive sync, default 'unknown' | | -| [hive_sync.table.strategy](#hive_synctablestrategy) | ALL (Optional) | Hive table synchronization strategy. Available option: RO, RT, ALL. | | -| [hive_sync.use_jdbc](#hive_syncuse_jdbc) | true (Optional) | Use JDBC when hive synchronization is enabled, default true | | -| [hive_sync.username](#hive_syncusername) | hive (Optional) | Username for hive sync, default 'hive' | | -| [hoodie.bucket.index.hash.field](#hoodiebucketindexhashfield) | (Optional) | Index key field. Value to be used as hashing to find the bucket ID. Should be a subset of or equal to the recordKey fields. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c` | | -| [hoodie.bucket.index.num.buckets](#hoodiebucketindexnumbuckets) | 4 (Optional) | Hudi bucket number per partition. Only affected if using Hudi bucket index. | | -| [hoodie.datasource.merge.type](#hoodiedatasourcemergetype) | payload_combine (Optional) | For Snapshot query on merge on read table. Use this key to define how the payloads are merged, in 1) skip_merge: read the base file records plus the log file records; 2) payload_combine: read the base file records first, for each record in base file, checks whether the key is in the log file records(combines the two records with same key for base and log file records), then read the left log file records | | -| [hoodie.datasource.write.keygenerator.type](#hoodiedatasourcewritekeygeneratortype) | SIMPLE (Optional) | Key generator type, that implements will extract the key out of incoming record. **Note** This is being actively worked on. Please use `hoodie.datasource.write.keygenerator.class` instead. | | -| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false (Optional) | Whether to encode the partition path url, default false | | -| [hoodie.index.bucket.engine](#hoodieindexbucketengine) | SIMPLE (Optional) | Type of bucket index engine. Available options: [SIMPLE | CONSISTENT_HASHING] | | -| [index.bootstrap.enabled](#indexbootstrapenabled) | false (Optional) | Whether to bootstrap the index state from existing hoodie table, default false | | -| [index.global.enabled](#indexglobalenabled) | true (Optional) | Whether to update index for the old partition path if same key record with different partition path came in, default true | | -| [index.partition.regex](#indexpartitionregex) | .* (Optional) | Whether to load partitions in state if partition path matching, default `*` | | -| [index.state.ttl](#indexstatettl) | 0.0 (Optional) | Index state ttl in days, default stores the index permanently | | -| [partition.default_name](#partitiondefault_name) | __HIVE_DEFAULT_PARTITION__ (Optional) | The default partition name in case the dynamic partition column value is null/empty string | | -| [payload.class](#payloadclass) | org.apache.hudi.common.model.EventTimeAvroPayload (Optional) | Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. 
This will render any value set for the option in-effective | | -| [read.data.skipping.enabled](#readdataskippingenabled) | false (Optional) | Enables data-skipping allowing queries to leverage indexes to reduce the search space byskipping over files | | -| [read.streaming.check-interval](#readstreamingcheck-interval) | 60 (Optional) | Check interval for streaming read of SECOND, default 1 minute | | -| [read.streaming.skip_clustering](#readstreamingskip_clustering) | false (Optional) | Whether to skip clustering instants to avoid reading base files of clustering operations for streaming read to improve read performance. | | -| [read.streaming.skip_compaction](#readstreamingskip_compaction) | false (Optional) | Whether to skip compaction instants and avoid reading compacted base files for streaming read to improve read performance. This option can be used to avoid reading duplicates when changelog mode is enabled, it is a solution to keep data integrity | | -| [read.utc-timezone](#readutc-timezone) | true (Optional) | Use UTC timezone or local timezone to the conversion between epoch time and LocalDateTime. Hive 0.x/1.x/2.x use local timezone. But Hive 3.x use UTC timezone, by default true | | -| [record.merger.impls](#recordmergerimpls) | org.apache.hudi.common.model.HoodieAvroRecordMerger (Optional) | List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. These merger impls will filter by record.merger.strategy. Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc) | | -| [record.merger.strategy](#recordmergerstrategy) | eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 (Optional) | Id of merger strategy. Hudi will pick HoodieRecordMerger implementations in record.merger.impls which has the same merger strategy id | | -| [write.batch.size](#writebatchsize) | 256.0 (Optional) | Batch buffer size in MB to flush data into the underneath filesystem, default 256MB | | -| [write.bulk_insert.shuffle_input](#writebulk_insertshuffle_input) | true (Optional) | Whether to shuffle the inputs by specific fields for bulk insert tasks, default true | | -| [write.bulk_insert.sort_input](#writebulk_insertsort_input) | true (Optional) | Whether to sort the inputs by specific fields for bulk insert tasks, default true | | -| [write.bulk_insert.sort_input.by_record_key](#writebulk_insertsort_inputby_record_key) | false (Optional) | Whether to sort the inputs by record keys for bulk insert tasks, default false | | -| [write.client.id](#writeclientid) | (Optional) | Unique identifier used to distinguish different writer pipelines for concurrent mode | | -| [write.commit.ack.timeout](#writecommitacktimeout) | -1 (Optional) | Timeout limit for a writer task after it finishes a checkpoint and waits for the instant commit success, only for internal use | | -| [write.ignore.failed](#writeignorefailed) | false (Optional) | Flag to indicate whether to ignore any non exception error (e.g. writestatus error). within a checkpoint batch. By default false. Turning this on, could hide the write status errors while the flink checkpoint moves ahead. So, would recommend users to use this with caution. 
| | -| [write.insert.cluster](#writeinsertcluster) | false (Optional) | Whether to merge small files for insert mode, if true, the write throughput will decrease because the read/write of existing small file, only valid for COW table, default false | | -| [write.log.max.size](#writelogmaxsize) | 1024 (Optional) | Maximum size allowed in MB for a log file before it is rolled over to the next version, default 1GB | | -| [write.log_block.size](#writelog_blocksize) | 128 (Optional) | Max log block size in MB for log file, default 128MB | | -| [write.merge.max_memory](#writemergemax_memory) | 100 (Optional) | Max memory in MB for merge, default 100MB | | -| [write.parquet.block.size](#writeparquetblocksize) | 120 (Optional) | Parquet RowGroup size. It's recommended to make this large enough that scan costs can be amortized by packing enough column values into a single row group. | | -| [write.parquet.page.size](#writeparquetpagesize) | 1 (Optional) | Parquet page size. Page is the unit of read within a parquet file. Within a block, pages are compressed separately. | | -| [write.precombine](#writeprecombine) | false (Optional) | Flag to indicate whether to drop duplicates before insert/upsert. By default these cases will accept duplicates, to gain extra performance: 1) insert operation; 2) upsert for MOR table, the MOR table deduplicate on reading | | -| [write.rate.limit](#writeratelimit) | 0 (Optional) | Write record rate limit per second to prevent traffic jitter and improve stability, default 0 (no limit) | | -| [write.retry.interval.ms](#writeretryintervalms) | 2000 (Optional) | Flag to indicate how long (by millisecond) before a retry should issued for failed checkpoint batch. By default 2000 and it will be doubled by every retry | | -| [write.retry.times](#writeretrytimes) | 3 (Optional) | Flag to indicate how many times streaming job should retry for a failed checkpoint batch. By default 3 | | -| [write.sort.memory](#writesortmemory) | 128 (Optional) | Sort memory in MB, default 128MB | | -| [write.task.max.size](#writetaskmaxsize) | 1024.0 (Optional) | Maximum memory in MB for a write task, when the threshold hits, it flushes the max size data bucket to avoid OOM, default 1GB | | +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [clustering.tasks](#clusteringtasks) | N/A **(Required)** | Parallelism of tasks that do actual clustering, default same as the write task parallelism

`Config Param: CLUSTERING_TASKS` | +| [compaction.tasks](#compactiontasks) | N/A **(Required)** | Parallelism of tasks that do actual compaction, default same as the write task parallelism

`Config Param: COMPACTION_TASKS` | +| [hive_sync.conf.dir](#hive_syncconfdir) | N/A **(Required)** | The hive configuration directory, where the hive-site.xml lies in, the file should be put on the client machine

`Config Param: HIVE_SYNC_CONF_DIR` | +| [hive_sync.serde_properties](#hive_syncserde_properties) | N/A **(Required)** | Serde properties to hive table, the data format is k1=v1 k2=v2

`Config Param: HIVE_SYNC_TABLE_SERDE_PROPERTIES` | +| [hive_sync.table_properties](#hive_synctable_properties) | N/A **(Required)** | Additional properties to store with table, the data format is k1=v1 k2=v2

`Config Param: HIVE_SYNC_TABLE_PROPERTIES` | +| [hoodie.datasource.write.keygenerator.class](#hoodiedatasourcewritekeygeneratorclass) | N/A **(Required)** | Key generator class, whose implementation will extract the key out of the incoming record

`Config Param: KEYGEN_CLASS_NAME` | +| [read.tasks](#readtasks) | N/A **(Required)** | Parallelism of tasks that do actual read, default is the parallelism of the execution environment

`Config Param: READ_TASKS` | +| [source.avro-schema](#sourceavro-schema) | N/A **(Required)** | Source avro schema string, the parsed schema is used for deserialization

`Config Param: SOURCE_AVRO_SCHEMA` | +| [source.avro-schema.path](#sourceavro-schemapath) | N/A **(Required)** | Source avro schema file path, the parsed schema is used for deserialization

`Config Param: SOURCE_AVRO_SCHEMA_PATH` | +| [write.bucket_assign.tasks](#writebucket_assigntasks) | N/A **(Required)** | Parallelism of tasks that do bucket assign, default same as the write task parallelism

`Config Param: BUCKET_ASSIGN_TASKS` | +| [write.index_bootstrap.tasks](#writeindex_bootstraptasks) | N/A **(Required)** | Parallelism of tasks that do index bootstrap, default same as the write task parallelism

`Config Param: INDEX_BOOTSTRAP_TASKS` | +| [write.partition.format](#writepartitionformat) | N/A **(Required)** | Partition path format, only valid when 'write.datetime.partitioning' is true, default is: 1) 'yyyyMMddHH' for timestamp(3) WITHOUT TIME ZONE, LONG, FLOAT, DOUBLE, DECIMAL; 2) 'yyyyMMdd' for DATE and INT.

`Config Param: PARTITION_FORMAT` | +| [write.tasks](#writetasks) | N/A **(Required)** | Parallelism of tasks that do actual write, default is the parallelism of the execution environment

`Config Param: WRITE_TASKS` | +| [clean.policy](#cleanpolicy) | KEEP_LATEST_COMMITS (Optional) | Clean policy to manage the Hudi table. Available options: KEEP_LATEST_COMMITS, KEEP_LATEST_FILE_VERSIONS, KEEP_LATEST_BY_HOURS. Default is KEEP_LATEST_COMMITS.

`Config Param: CLEAN_POLICY` | +| [clean.retain_file_versions](#cleanretain_file_versions) | 5 (Optional) | Number of file versions to retain. default 5

`Config Param: CLEAN_RETAIN_FILE_VERSIONS` | +| [clean.retain_hours](#cleanretain_hours) | 24 (Optional) | Number of hours for which commits need to be retained. This config provides a more flexible option as compared to number of commits retained for cleaning service. Setting this property ensures all the files, but the latest in a file group, corresponding to commits with commit times older than the configured number of hours to be retained are cleaned.

`Config Param: CLEAN_RETAIN_HOURS` | +| [clustering.delta_commits](#clusteringdelta_commits) | 4 (Optional) | Max delta commits needed to trigger clustering, default 4 commits

`Config Param: CLUSTERING_DELTA_COMMITS` | +| [clustering.plan.partition.filter.mode](#clusteringplanpartitionfiltermode) | NONE (Optional) | Partition filter mode used in the creation of clustering plan. Available values are - NONE: do not filter table partitions and thus the clustering plan will include all partitions that have clustering candidates. RECENT_DAYS: keep a continuous range of partitions, works together with configs 'clustering.plan.strategy.daybased.lookback.partitions' and 'clustering.plan.strategy.daybased.skipfromlatest.partitions'. SELECTED_PARTITIONS: keep partitions that are in the specified range ['clustering.plan.strategy.cluster.begin.partition', 'clustering.plan.strategy.cluster.end.partition']. DAY_ROLLING: clustering partitions on a rolling basis by the hour to avoid clustering all partitions each time; this strategy sorts the partitions in ascending order and chooses the partitions whose index, divided by 24, leaves a remainder equal to the current hour.

`Config Param: CLUSTERING_PLAN_PARTITION_FILTER_MODE_NAME` | +| [clustering.plan.strategy.class](#clusteringplanstrategyclass) | org.apache.hudi.client.clustering.plan.strategy.FlinkSizeBasedClusteringPlanStrategy (Optional) | Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan i.e select what file groups are being clustered. Default strategy, looks at the last N (determined by clustering.plan.strategy.daybased.lookback.partitions) day based partitions picks the small file slices within those partitions.

`Config Param: CLUSTERING_PLAN_STRATEGY_CLASS` | +| [clustering.plan.strategy.cluster.begin.partition](#clusteringplanstrategyclusterbeginpartition) | (Optional) | Begin partition used to filter partition (inclusive)

`Config Param: CLUSTERING_PLAN_STRATEGY_CLUSTER_BEGIN_PARTITION` | +| [clustering.plan.strategy.cluster.end.partition](#clusteringplanstrategyclusterendpartition) | (Optional) | End partition used to filter partition (inclusive)

`Config Param: CLUSTERING_PLAN_STRATEGY_CLUSTER_END_PARTITION` | +| [clustering.plan.strategy.daybased.lookback.partitions](#clusteringplanstrategydaybasedlookbackpartitions) | 2 (Optional) | Number of partitions to list to create ClusteringPlan, default is 2

`Config Param: CLUSTERING_TARGET_PARTITIONS` | +| [clustering.plan.strategy.daybased.skipfromlatest.partitions](#clusteringplanstrategydaybasedskipfromlatestpartitions) | 0 (Optional) | Number of partitions to skip from latest when choosing partitions to create ClusteringPlan

`Config Param: CLUSTERING_PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST` | +| [clustering.plan.strategy.max.num.groups](#clusteringplanstrategymaxnumgroups) | 30 (Optional) | Maximum number of groups to create as part of ClusteringPlan. Increasing groups will increase parallelism, default is 30

`Config Param: CLUSTERING_MAX_NUM_GROUPS` | +| [clustering.plan.strategy.partition.regex.pattern](#clusteringplanstrategypartitionregexpattern) | (Optional) | Filter clustering partitions that matched regex pattern

`Config Param: CLUSTERING_PLAN_STRATEGY_PARTITION_REGEX_PATTERN` | +| [clustering.plan.strategy.partition.selected](#clusteringplanstrategypartitionselected) | (Optional) | Partitions to run clustering

`Config Param: CLUSTERING_PLAN_STRATEGY_PARTITION_SELECTED` | +| [clustering.plan.strategy.sort.columns](#clusteringplanstrategysortcolumns) | (Optional) | Columns to sort the data by when clustering

`Config Param: CLUSTERING_SORT_COLUMNS` | +| [clustering.schedule.enabled](#clusteringscheduleenabled) | false (Optional) | Schedule the cluster plan, default false

`Config Param: CLUSTERING_SCHEDULE_ENABLED` | +| [compaction.delta_seconds](#compactiondelta_seconds) | 3600 (Optional) | Max delta seconds time needed to trigger compaction, default 1 hour

`Config Param: COMPACTION_DELTA_SECONDS` | +| [compaction.max_memory](#compactionmax_memory) | 100 (Optional) | Max memory in MB for compaction spillable map, default 100MB

`Config Param: COMPACTION_MAX_MEMORY` | +| [compaction.schedule.enabled](#compactionscheduleenabled) | true (Optional) | Schedule the compaction plan, enabled by default for MOR

`Config Param: COMPACTION_SCHEDULE_ENABLED` | +| [compaction.target_io](#compactiontarget_io) | 512000 (Optional) | Target IO in MB for per compaction (both read and write), default 500 GB

`Config Param: COMPACTION_TARGET_IO` | +| [compaction.timeout.seconds](#compactiontimeoutseconds) | 1200 (Optional) | Max timeout time in seconds for online compaction to rollback, default 20 minutes

`Config Param: COMPACTION_TIMEOUT_SECONDS` | +| [compaction.trigger.strategy](#compactiontriggerstrategy) | num_commits (Optional) | Strategy to trigger compaction, options are 'num_commits': trigger compaction when reach N delta commits; 'time_elapsed': trigger compaction when time elapsed > N seconds since last compaction; 'num_and_time': trigger compaction when both NUM_COMMITS and TIME_ELAPSED are satisfied; 'num_or_time': trigger compaction when NUM_COMMITS or TIME_ELAPSED is satisfied. Default is 'num_commits'

`Config Param: COMPACTION_TRIGGER_STRATEGY` | +| [hive_sync.assume_date_partitioning](#hive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/mm/dd, default false

`Config Param: HIVE_SYNC_ASSUME_DATE_PARTITION` | +| [hive_sync.auto_create_db](#hive_syncauto_create_db) | true (Optional) | Auto create hive database if it does not exist, default true

`Config Param: HIVE_SYNC_AUTO_CREATE_DB` | +| [hive_sync.db](#hive_syncdb) | default (Optional) | Database name for hive sync, default 'default'

`Config Param: HIVE_SYNC_DB` | +| [hive_sync.file_format](#hive_syncfile_format) | PARQUET (Optional) | File format for hive sync, default 'PARQUET'

`Config Param: HIVE_SYNC_FILE_FORMAT` | +| [hive_sync.ignore_exceptions](#hive_syncignore_exceptions) | false (Optional) | Ignore exceptions during hive synchronization, default false

`Config Param: HIVE_SYNC_IGNORE_EXCEPTIONS` | +| [hive_sync.partition_extractor_class](#hive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Tool to extract the partition value from HDFS path, default 'MultiPartKeysValueExtractor'

`Config Param: HIVE_SYNC_PARTITION_EXTRACTOR_CLASS_NAME` | +| [hive_sync.partition_fields](#hive_syncpartition_fields) | (Optional) | Partition fields for hive sync, default ''

`Config Param: HIVE_SYNC_PARTITION_FIELDS` | +| [hive_sync.password](#hive_syncpassword) | hive (Optional) | Password for hive sync, default 'hive'

`Config Param: HIVE_SYNC_PASSWORD` | +| [hive_sync.skip_ro_suffix](#hive_syncskip_ro_suffix) | false (Optional) | Skip the _ro suffix for Read optimized table when registering, default false

`Config Param: HIVE_SYNC_SKIP_RO_SUFFIX` | +| [hive_sync.support_timestamp](#hive_syncsupport_timestamp) | true (Optional) | INT64 with original type TIMESTAMP_MICROS is converted to hive timestamp type. Disabled by default for backward compatibility.

`Config Param: HIVE_SYNC_SUPPORT_TIMESTAMP` | +| [hive_sync.table](#hive_synctable) | unknown (Optional) | Table name for hive sync, default 'unknown'

`Config Param: HIVE_SYNC_TABLE` | +| [hive_sync.table.strategy](#hive_synctablestrategy) | ALL (Optional) | Hive table synchronization strategy. Available option: RO, RT, ALL.

`Config Param: HIVE_SYNC_TABLE_STRATEGY` | +| [hive_sync.use_jdbc](#hive_syncuse_jdbc) | true (Optional) | Use JDBC when hive synchronization is enabled, default true

`Config Param: HIVE_SYNC_USE_JDBC` | +| [hive_sync.username](#hive_syncusername) | hive (Optional) | Username for hive sync, default 'hive'

`Config Param: HIVE_SYNC_USERNAME` | +| [hoodie.bucket.index.hash.field](#hoodiebucketindexhashfield) | (Optional) | Index key field. Value to be used as hashing to find the bucket ID. Should be a subset of or equal to the recordKey fields. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c`

`Config Param: INDEX_KEY_FIELD` | +| [hoodie.bucket.index.num.buckets](#hoodiebucketindexnumbuckets) | 4 (Optional) | Hudi bucket number per partition. Only affected if using Hudi bucket index.

`Config Param: BUCKET_INDEX_NUM_BUCKETS` | +| [hoodie.datasource.merge.type](#hoodiedatasourcemergetype) | payload_combine (Optional) | For Snapshot query on merge on read table. Use this key to define how the payloads are merged, in 1) skip_merge: read the base file records plus the log file records; 2) payload_combine: read the base file records first, for each record in base file, checks whether the key is in the log file records(combines the two records with same key for base and log file records), then read the left log file records

`Config Param: MERGE_TYPE` | +| [hoodie.datasource.write.keygenerator.type](#hoodiedatasourcewritekeygeneratortype) | SIMPLE (Optional) | Key generator type, whose implementation will extract the key out of the incoming record. **Note** This is being actively worked on. Please use `hoodie.datasource.write.keygenerator.class` instead.

`Config Param: KEYGEN_TYPE` | +| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false (Optional) | Whether to encode the partition path url, default false

`Config Param: URL_ENCODE_PARTITIONING` | +| [hoodie.index.bucket.engine](#hoodieindexbucketengine) | SIMPLE (Optional) | Type of bucket index engine. Available options: [SIMPLE | CONSISTENT_HASHING]

`Config Param: BUCKET_INDEX_ENGINE_TYPE` | +| [index.bootstrap.enabled](#indexbootstrapenabled) | false (Optional) | Whether to bootstrap the index state from existing hoodie table, default false

`Config Param: INDEX_BOOTSTRAP_ENABLED` | +| [index.global.enabled](#indexglobalenabled) | true (Optional) | Whether to update index for the old partition path if same key record with different partition path came in, default true

`Config Param: INDEX_GLOBAL_ENABLED` | +| [index.partition.regex](#indexpartitionregex) | .* (Optional) | Whether to load partitions in state if the partition path matches the regex pattern, default `*`

`Config Param: INDEX_PARTITION_REGEX` | +| [index.state.ttl](#indexstatettl) | 0.0 (Optional) | Index state ttl in days, default stores the index permanently

`Config Param: INDEX_STATE_TTL` | +| [partition.default_name](#partitiondefault_name) | __HIVE_DEFAULT_PARTITION__ (Optional) | The default partition name in case the dynamic partition column value is null/empty string

`Config Param: PARTITION_DEFAULT_NAME` | +| [payload.class](#payloadclass) | org.apache.hudi.common.model.EventTimeAvroPayload (Optional) | Payload class used. Override this if you like to roll your own merge logic when upserting/inserting. This will render any value set for the option ineffective

`Config Param: PAYLOAD_CLASS_NAME` | +| [read.data.skipping.enabled](#readdataskippingenabled) | false (Optional) | Enables data-skipping allowing queries to leverage indexes to reduce the search space by skipping over files

`Config Param: READ_DATA_SKIPPING_ENABLED` | +| [read.streaming.check-interval](#readstreamingcheck-interval) | 60 (Optional) | Check interval for streaming read, in seconds, default 1 minute

`Config Param: READ_STREAMING_CHECK_INTERVAL` | +| [read.streaming.skip_clustering](#readstreamingskip_clustering) | false (Optional) | Whether to skip clustering instants to avoid reading base files of clustering operations for streaming read to improve read performance.

`Config Param: READ_STREAMING_SKIP_CLUSTERING` | +| [read.streaming.skip_compaction](#readstreamingskip_compaction) | false (Optional) | Whether to skip compaction instants and avoid reading compacted base files for streaming read to improve read performance. This option can be used to avoid reading duplicates when changelog mode is enabled, it is a solution to keep data integrity

`Config Param: READ_STREAMING_SKIP_COMPACT` | +| [read.utc-timezone](#readutc-timezone) | true (Optional) | Use UTC timezone or local timezone for the conversion between epoch time and LocalDateTime. Hive 0.x/1.x/2.x use the local timezone, but Hive 3.x uses the UTC timezone. By default true

`Config Param: UTC_TIMEZONE` | +| [record.merger.impls](#recordmergerimpls) | org.apache.hudi.common.model.HoodieAvroRecordMerger (Optional) | List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. These merger impls will filter by record.merger.strategy. Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc)

`Config Param: RECORD_MERGER_IMPLS` | +| [record.merger.strategy](#recordmergerstrategy) | eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 (Optional) | Id of merger strategy. Hudi will pick HoodieRecordMerger implementations in record.merger.impls which has the same merger strategy id

`Config Param: RECORD_MERGER_STRATEGY` | +| [write.batch.size](#writebatchsize) | 256.0 (Optional) | Batch buffer size in MB to flush data into the underneath filesystem, default 256MB

`Config Param: WRITE_BATCH_SIZE` | +| [write.bulk_insert.shuffle_input](#writebulk_insertshuffle_input) | true (Optional) | Whether to shuffle the inputs by specific fields for bulk insert tasks, default true

`Config Param: WRITE_BULK_INSERT_SHUFFLE_INPUT` | +| [write.bulk_insert.sort_input](#writebulk_insertsort_input) | true (Optional) | Whether to sort the inputs by specific fields for bulk insert tasks, default true

`Config Param: WRITE_BULK_INSERT_SORT_INPUT` | +| [write.bulk_insert.sort_input.by_record_key](#writebulk_insertsort_inputby_record_key) | false (Optional) | Whether to sort the inputs by record keys for bulk insert tasks, default false

`Config Param: WRITE_BULK_INSERT_SORT_INPUT_BY_RECORD_KEY` | +| [write.client.id](#writeclientid) | (Optional) | Unique identifier used to distinguish different writer pipelines for concurrent mode

`Config Param: WRITE_CLIENT_ID` | +| [write.commit.ack.timeout](#writecommitacktimeout) | -1 (Optional) | Timeout limit for a writer task after it finishes a checkpoint and waits for the instant commit success, only for internal use

`Config Param: WRITE_COMMIT_ACK_TIMEOUT` | +| [write.ignore.failed](#writeignorefailed) | false (Optional) | Flag to indicate whether to ignore any non-exception error (e.g. writestatus error) within a checkpoint batch. By default false. Turning this on could hide write status errors while the Flink checkpoint moves ahead, so users are recommended to use this with caution.

`Config Param: IGNORE_FAILED` | +| [write.insert.cluster](#writeinsertcluster) | false (Optional) | Whether to merge small files for insert mode, if true, the write throughput will decrease because the read/write of existing small file, only valid for COW table, default false

`Config Param: INSERT_CLUSTER` | +| [write.log.max.size](#writelogmaxsize) | 1024 (Optional) | Maximum size allowed in MB for a log file before it is rolled over to the next version, default 1GB

`Config Param: WRITE_LOG_MAX_SIZE` | +| [write.log_block.size](#writelog_blocksize) | 128 (Optional) | Max log block size in MB for log file, default 128MB

`Config Param: WRITE_LOG_BLOCK_SIZE` | +| [write.merge.max_memory](#writemergemax_memory) | 100 (Optional) | Max memory in MB for merge, default 100MB

`Config Param: WRITE_MERGE_MAX_MEMORY` | +| [write.parquet.block.size](#writeparquetblocksize) | 120 (Optional) | Parquet RowGroup size. It's recommended to make this large enough that scan costs can be amortized by packing enough column values into a single row group.

`Config Param: WRITE_PARQUET_BLOCK_SIZE` | +| [write.parquet.page.size](#writeparquetpagesize) | 1 (Optional) | Parquet page size. Page is the unit of read within a parquet file. Within a block, pages are compressed separately.

`Config Param: WRITE_PARQUET_PAGE_SIZE` | +| [write.precombine](#writeprecombine) | false (Optional) | Flag to indicate whether to drop duplicates before insert/upsert. By default these cases will accept duplicates, to gain extra performance: 1) insert operation; 2) upsert for MOR table, the MOR table deduplicate on reading

`Config Param: PRE_COMBINE` | +| [write.rate.limit](#writeratelimit) | 0 (Optional) | Write record rate limit per second to prevent traffic jitter and improve stability, default 0 (no limit)

`Config Param: WRITE_RATE_LIMIT` | +| [write.retry.interval.ms](#writeretryintervalms) | 2000 (Optional) | Flag to indicate how long (in milliseconds) to wait before a retry is issued for a failed checkpoint batch. By default 2000, and it will be doubled on every retry

`Config Param: RETRY_INTERVAL_MS` | +| [write.retry.times](#writeretrytimes) | 3 (Optional) | Flag to indicate how many times streaming job should retry for a failed checkpoint batch. By default 3

`Config Param: RETRY_TIMES` | +| [write.sort.memory](#writesortmemory) | 128 (Optional) | Sort memory in MB, default 128MB

`Config Param: WRITE_SORT_MEMORY` | +| [write.task.max.size](#writetaskmaxsize) | 1024.0 (Optional) | Maximum memory in MB for a write task, when the threshold hits, it flushes the max size data bucket to avoid OOM, default 1GB

`Config Param: WRITE_TASK_MAX_SIZE` | --- ## Write Client Configs {#WRITE_CLIENT} @@ -344,15 +344,15 @@ The following set of configurations are common across Hudi. [**Advanced Configs**](#Common-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------------- | -------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [as.of.instant](#asofinstant) | N/A **(Required)** | The query instant for time travel. Without specified this option, we query the latest snapshot. | | -| [hoodie.common.diskmap.compression.enabled](#hoodiecommondiskmapcompressionenabled) | true (Optional) | Turn on compression for BITCASK disk map used by the External Spillable Map | | -| [hoodie.common.spillable.diskmap.type](#hoodiecommonspillablediskmaptype) | BITCASK (Optional) | When handling input data that cannot be held in memory, to merge with a file on storage, a spillable diskmap is employed. By default, we use a persistent hashmap based loosely on bitcask, that offers O(1) inserts, lookups. Change this to `ROCKS_DB` to prefer using rocksDB, for handling the spill. | | -| [hoodie.datasource.read.handle.hollow.commit](#hoodiedatasourcereadhandlehollowcommit) | EXCEPTION (Optional) | When doing incremental queries, there could be hollow commits (requested or inflight commits that are not the latest) that are produced by concurrent writers and could lead to potential data loss. This config allows users to have different ways of handling this situation. The valid values are [EXCEPTION, BLOCK, USE_STATE_TRANSITION_TIME]: Use `EXCEPTION` to throw an exception when hollow commit is detected. This is helpful when hollow commits are not expected. Use `BLOCK` to block processing commits from going beyond the hollow ones. This fits the case where waiting for hollow commits to finish is acceptable. Use `USE_STATE_TRANSITION_TIME` (experimental) to query commits in range by state transition time (completion time), instead of commit time (start time). Using this mode will result in `begin.instanttime` and `end.instanttime` using `stateTransitionTime` instead of the instant's commit time. | 0.14.0 | -| [hoodie.datasource.write.reconcile.schema](#hoodiedatasourcewritereconcileschema) | false (Optional) | This config controls how writer's schema will be selected based on the incoming batch's schema as well as existing table's one. When schema reconciliation is DISABLED, incoming batch's schema will be picked as a writer-schema (therefore updating table's schema). 
When schema reconciliation is ENABLED, writer-schema will be picked such that table's schema (after txn) is either kept the same or extended, meaning that we'll always prefer the schema that either adds new columns or stays the same. This enables us, to always extend the table's schema during evolution and never lose the data (when, for ex, existing column is being dropped in a new batch) | | -| [hoodie.fs.atomic_creation.support](#hoodiefsatomic_creationsupport) | (Optional) | This config is used to specify the file system which supports atomic file creation . atomic means that an operation either succeeds and has an effect or has fails and has no effect; now this feature is used by FileSystemLockProvider to guaranteeing that only one writer can create the lock file at a time. since some FS does not support atomic file creation (eg: S3), we decide the FileSystemLockProvider only support HDFS,local FS and View FS as default. if you want to use FileSystemLockProvider with other FS, you can set this config with the FS scheme, eg: fs1,fs2 | | -| [hoodie.schema.on.read.enable](#hoodieschemaonreadenable) | false (Optional) | Enables support for Schema Evolution feature | | +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------- | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [as.of.instant](#asofinstant) | N/A **(Required)** | The query instant for time travel. Without specified this option, we query the latest snapshot.

`Config Param: TIMESTAMP_AS_OF` | +| [hoodie.common.diskmap.compression.enabled](#hoodiecommondiskmapcompressionenabled) | true (Optional) | Turn on compression for BITCASK disk map used by the External Spillable Map

`Config Param: DISK_MAP_BITCASK_COMPRESSION_ENABLED` | +| [hoodie.common.spillable.diskmap.type](#hoodiecommonspillablediskmaptype) | BITCASK (Optional) | When handling input data that cannot be held in memory, to merge with a file on storage, a spillable diskmap is employed. By default, we use a persistent hashmap based loosely on bitcask, that offers O(1) inserts, lookups. Change this to `ROCKS_DB` to prefer using rocksDB, for handling the spill.

`Config Param: SPILLABLE_DISK_MAP_TYPE` | +| [hoodie.datasource.read.handle.hollow.commit](#hoodiedatasourcereadhandlehollowcommit) | EXCEPTION (Optional) | When doing incremental queries, there could be hollow commits (requested or inflight commits that are not the latest) that are produced by concurrent writers and could lead to potential data loss. This config allows users to have different ways of handling this situation. The valid values are [EXCEPTION, BLOCK, USE_STATE_TRANSITION_TIME]: Use `EXCEPTION` to throw an exception when hollow commit is detected. This is helpful when hollow commits are not expected. Use `BLOCK` to block processing commits from going beyond the hollow ones. This fits the case where waiting for hollow commits to finish is acceptable. Use `USE_STATE_TRANSITION_TIME` (experimental) to query commits in range by state transition time (completion time), instead of commit time (start time). Using this mode will result in `begin.instanttime` and `end.instanttime` using `stateTransitionTime` instead of the instant's commit time.

`Config Param: INCREMENTAL_READ_HANDLE_HOLLOW_COMMIT`
`Since Version: 0.14.0` | +| [hoodie.datasource.write.reconcile.schema](#hoodiedatasourcewritereconcileschema) | false (Optional) | This config controls how writer's schema will be selected based on the incoming batch's schema as well as existing table's one. When schema reconciliation is DISABLED, incoming batch's schema will be picked as a writer-schema (therefore updating table's schema). When schema reconciliation is ENABLED, writer-schema will be picked such that table's schema (after txn) is either kept the same or extended, meaning that we'll always prefer the schema that either adds new columns or stays the same. This enables us, to always extend the table's schema during evolution and never lose the data (when, for ex, existing column is being dropped in a new batch)

`Config Param: RECONCILE_SCHEMA` | +| [hoodie.fs.atomic_creation.support](#hoodiefsatomic_creationsupport) | (Optional) | This config is used to specify the file systems that support atomic file creation. Atomic means that an operation either succeeds and has an effect or fails and has no effect; this feature is used by FileSystemLockProvider to guarantee that only one writer can create the lock file at a time. Since some file systems do not support atomic file creation (e.g. S3), FileSystemLockProvider only supports HDFS, local FS and View FS by default. If you want to use FileSystemLockProvider with another FS, you can set this config with that FS scheme, e.g. fs1,fs2

`Config Param: HOODIE_FS_ATOMIC_CREATION_SUPPORT` | +| [hoodie.schema.on.read.enable](#hoodieschemaonreadenable) | false (Optional) | Enables support for Schema Evolution feature

`Config Param: SCHEMA_EVOLUTION_ENABLE` | --- @@ -364,46 +364,46 @@ Configurations used by the Hudi Metadata Table. This table maintains the metadat [**Basic Configs**](#Metadata-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.metadata.enable](#hoodiemetadataenable) | true (Optional) | Enable the internal metadata table which serves table metadata like level file listings | 0.7.0 | -| [hoodie.metadata.index.bloom.filter.enable](#hoodiemetadataindexbloomfilterenable) | false (Optional) | Enable indexing bloom filters of user data files under metadata table. When enabled, metadata table will have a partition to store the bloom filter index and will be used during the index lookups. | 0.11.0 | -| [hoodie.metadata.index.column.stats.enable](#hoodiemetadataindexcolumnstatsenable) | false (Optional) | Enable indexing column ranges of user data files under metadata table key lookups. When enabled, metadata table will have a partition to store the column ranges and will be used for pruning files during the index lookups. | 0.11.0 | +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------- | ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.metadata.enable](#hoodiemetadataenable) | true (Optional) | Enable the internal metadata table which serves table metadata like level file listings

`Config Param: ENABLE`
`Since Version: 0.7.0` | +| [hoodie.metadata.index.bloom.filter.enable](#hoodiemetadataindexbloomfilterenable) | false (Optional) | Enable indexing bloom filters of user data files under metadata table. When enabled, metadata table will have a partition to store the bloom filter index and will be used during the index lookups.

`Config Param: ENABLE_METADATA_INDEX_BLOOM_FILTER`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.enable](#hoodiemetadataindexcolumnstatsenable) | false (Optional) | Enable indexing column ranges of user data files under metadata table key lookups. When enabled, metadata table will have a partition to store the column ranges and will be used for pruning files during the index lookups.

`Config Param: ENABLE_METADATA_INDEX_COLUMN_STATS`
`Since Version: 0.11.0` | [**Advanced Configs**](#Metadata-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------------------------ | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.metadata.index.bloom.filter.column.list](#hoodiemetadataindexbloomfiltercolumnlist) | N/A **(Required)** | Comma-separated list of columns for which bloom filter index will be built. If not set, only record key will be indexed. | 0.11.0 | -| [hoodie.metadata.index.column.stats.column.list](#hoodiemetadataindexcolumnstatscolumnlist) | N/A **(Required)** | Comma-separated list of columns for which column stats index will be built. If not set, all columns will be indexed | 0.11.0 | -| [hoodie.metadata.index.column.stats.processing.mode.override](#hoodiemetadataindexcolumnstatsprocessingmodeoverride) | N/A **(Required)** | By default Column Stats Index is automatically determining whether it should be read and processed either'in-memory' (w/in executing process) or using Spark (on a cluster), based on some factors like the size of the Index and how many columns are read. This config allows to override this behavior. | 0.12.0 | -| [_hoodie.metadata.ignore.spurious.deletes](#_hoodiemetadataignorespuriousdeletes) | true (Optional) | There are cases when extra files are requested to be deleted from metadata table which are never added before. This config determines how to handle such spurious deletes | 0.10.0 | -| [hoodie.assume.date.partitioning](#hoodieassumedatepartitioning) | false (Optional) | Should HoodieWriteClient assume the data is partitioned by dates, i.e three levels from base path. This is a stop-gap to support tables created by versions < 0.3.1. Will be removed eventually | 0.3.0 | -| [hoodie.file.listing.parallelism](#hoodiefilelistingparallelism) | 200 (Optional) | Parallelism to use, when listing the table on lake storage. | 0.7.0 | -| [hoodie.metadata.compact.max.delta.commits](#hoodiemetadatacompactmaxdeltacommits) | 10 (Optional) | Controls how often the metadata table is compacted. | 0.7.0 | -| [hoodie.metadata.dir.filter.regex](#hoodiemetadatadirfilterregex) | (Optional) | Directories matching this regex, will be filtered out when initializing metadata table from lake storage for the first time. | 0.7.0 | -| [hoodie.metadata.index.async](#hoodiemetadataindexasync) | false (Optional) | Enable asynchronous indexing of metadata table. | 0.11.0 | -| [hoodie.metadata.index.bloom.filter.file.group.count](#hoodiemetadataindexbloomfilterfilegroupcount) | 4 (Optional) | Metadata bloom filter index partition file group count. This controls the size of the base and log files and read parallelism in the bloom filter index partition. The recommendation is to size the file group count such that the base files are under 1GB. | 0.11.0 | -| [hoodie.metadata.index.bloom.filter.parallelism](#hoodiemetadataindexbloomfilterparallelism) | 200 (Optional) | Parallelism to use for generating bloom filter index in metadata table. 
| 0.11.0 | -| [hoodie.metadata.index.check.timeout.seconds](#hoodiemetadataindexchecktimeoutseconds) | 900 (Optional) | After the async indexer has finished indexing upto the base instant, it will ensure that all inflight writers reliably write index updates as well. If this timeout expires, then the indexer will abort itself safely. | 0.11.0 | -| [hoodie.metadata.index.column.stats.file.group.count](#hoodiemetadataindexcolumnstatsfilegroupcount) | 2 (Optional) | Metadata column stats partition file group count. This controls the size of the base and log files and read parallelism in the column stats index partition. The recommendation is to size the file group count such that the base files are under 1GB. | 0.11.0 | -| [hoodie.metadata.index.column.stats.inMemory.projection.threshold](#hoodiemetadataindexcolumnstatsinMemoryprojectionthreshold) | 100000 (Optional) | When reading Column Stats Index, if the size of the expected resulting projection is below the in-memory threshold (counted by the # of rows), it will be attempted to be loaded "in-memory" (ie not using the execution engine like Spark, Flink, etc). If the value is above the threshold execution engine will be used to compose the projection. | 0.12.0 | -| [hoodie.metadata.index.column.stats.parallelism](#hoodiemetadataindexcolumnstatsparallelism) | 200 (Optional) | Parallelism to use, when generating column stats index. | 0.11.0 | -| [hoodie.metadata.insert.parallelism](#hoodiemetadatainsertparallelism) | 1 (Optional) | Parallelism to use when inserting to the metadata table | 0.7.0 | -| [hoodie.metadata.log.compaction.blocks.threshold](#hoodiemetadatalogcompactionblocksthreshold) | 5 (Optional) | Controls the criteria to log compacted files groups in metadata table. | | -| [hoodie.metadata.log.compaction.enable](#hoodiemetadatalogcompactionenable) | false (Optional) | This configs enables logcompaction for the metadata table. | 0.14 | -| [hoodie.metadata.max.deltacommits.when_pending](#hoodiemetadatamaxdeltacommitswhen_pending) | 1000 (Optional) | When there is a pending instant in data table, this config limits the allowed number of deltacommits in metadata table to prevent the metadata table's timeline from growing unboundedly as compaction won't be triggered due to the pending data table instant. | 0.14.0 | -| [hoodie.metadata.max.reader.buffer.size](#hoodiemetadatamaxreaderbuffersize) | 10485760 (Optional) | Max memory to use for the reader buffer while merging log blocks | 0.14.0 | -| [hoodie.metadata.max.reader.memory](#hoodiemetadatamaxreadermemory) | 1073741824 (Optional) | Max memory to use for the reader to read from metadata | 0.14.0 | -| [hoodie.metadata.metrics.enable](#hoodiemetadatametricsenable) | false (Optional) | Enable publishing of metrics around metadata table. | 0.7.0 | -| [hoodie.metadata.optimized.log.blocks.scan.enable](#hoodiemetadataoptimizedlogblocksscanenable) | false (Optional) | Optimized log blocks scanner that addresses all the multi-writer use-cases while appending to log files. It also differentiates original blocks written by ingestion writers and compacted blocks written by log compaction. | 0.13.0 | -| [hoodie.metadata.record.index.enable](#hoodiemetadatarecordindexenable) | false (Optional) | Create the HUDI Record Index within the Metadata Table | 0.14.0 | -| [hoodie.metadata.record.index.growth.factor](#hoodiemetadatarecordindexgrowthfactor) | 2.0 (Optional) | The current number of records are multiplied by this number when estimating the number of file groups to create automatically. 
This helps account for growth in the number of records in the dataset. | 0.14.0 | -| [hoodie.metadata.record.index.max.filegroup.count](#hoodiemetadatarecordindexmaxfilegroupcount) | 1000 (Optional) | Maximum number of file groups to use for Record Index. | 0.14.0 | -| [hoodie.metadata.record.index.max.filegroup.size](#hoodiemetadatarecordindexmaxfilegroupsize) | 1073741824 (Optional) | Maximum size in bytes of a single file group. Large file group takes longer to compact. | 0.14.0 | -| [hoodie.metadata.record.index.min.filegroup.count](#hoodiemetadatarecordindexminfilegroupcount) | 10 (Optional) | Minimum number of file groups to use for Record Index. | 0.14.0 | -| [hoodie.metadata.spillable.map.path](#hoodiemetadataspillablemappath) | (Optional) | Path on local storage to use, when keys read from metadata are held in a spillable map. | 0.14.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------ | --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metadata.index.bloom.filter.column.list](#hoodiemetadataindexbloomfiltercolumnlist) | N/A **(Required)** | Comma-separated list of columns for which bloom filter index will be built. If not set, only record key will be indexed.

`Config Param: BLOOM_FILTER_INDEX_FOR_COLUMNS`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.column.list](#hoodiemetadataindexcolumnstatscolumnlist) | N/A **(Required)** | Comma-separated list of columns for which column stats index will be built. If not set, all columns will be indexed

`Config Param: COLUMN_STATS_INDEX_FOR_COLUMNS`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.processing.mode.override](#hoodiemetadataindexcolumnstatsprocessingmodeoverride) | N/A **(Required)** | By default, the Column Stats Index automatically determines whether it should be read and processed either 'in-memory' (within the executing process) or using Spark (on a cluster), based on factors like the size of the index and how many columns are read. This config allows overriding this behavior.

`Config Param: COLUMN_STATS_INDEX_PROCESSING_MODE_OVERRIDE`
`Since Version: 0.12.0` | +| [_hoodie.metadata.ignore.spurious.deletes](#_hoodiemetadataignorespuriousdeletes) | true (Optional) | There are cases when extra files are requested to be deleted from the metadata table even though they were never added. This config determines how to handle such spurious deletes

`Config Param: IGNORE_SPURIOUS_DELETES`
`Since Version: 0.10.0` | +| [hoodie.assume.date.partitioning](#hoodieassumedatepartitioning) | false (Optional) | Should HoodieWriteClient assume the data is partitioned by dates, i.e., three levels from the base path. This is a stop-gap to support tables created by versions < 0.3.1. Will be removed eventually

`Config Param: ASSUME_DATE_PARTITIONING`
`Since Version: 0.3.0` | +| [hoodie.file.listing.parallelism](#hoodiefilelistingparallelism) | 200 (Optional) | Parallelism to use, when listing the table on lake storage.

`Config Param: FILE_LISTING_PARALLELISM_VALUE`
`Since Version: 0.7.0` | +| [hoodie.metadata.compact.max.delta.commits](#hoodiemetadatacompactmaxdeltacommits) | 10 (Optional) | Controls how often the metadata table is compacted.

`Config Param: COMPACT_NUM_DELTA_COMMITS`
`Since Version: 0.7.0` | +| [hoodie.metadata.dir.filter.regex](#hoodiemetadatadirfilterregex) | (Optional) | Directories matching this regex, will be filtered out when initializing metadata table from lake storage for the first time.

`Config Param: DIR_FILTER_REGEX`
`Since Version: 0.7.0` | +| [hoodie.metadata.index.async](#hoodiemetadataindexasync) | false (Optional) | Enable asynchronous indexing of metadata table.

`Config Param: ASYNC_INDEX_ENABLE`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.bloom.filter.file.group.count](#hoodiemetadataindexbloomfilterfilegroupcount) | 4 (Optional) | Metadata bloom filter index partition file group count. This controls the size of the base and log files and read parallelism in the bloom filter index partition. The recommendation is to size the file group count such that the base files are under 1GB.

`Config Param: METADATA_INDEX_BLOOM_FILTER_FILE_GROUP_COUNT`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.bloom.filter.parallelism](#hoodiemetadataindexbloomfilterparallelism) | 200 (Optional) | Parallelism to use for generating bloom filter index in metadata table.

`Config Param: BLOOM_FILTER_INDEX_PARALLELISM`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.check.timeout.seconds](#hoodiemetadataindexchecktimeoutseconds) | 900 (Optional) | After the async indexer has finished indexing upto the base instant, it will ensure that all inflight writers reliably write index updates as well. If this timeout expires, then the indexer will abort itself safely.

`Config Param: METADATA_INDEX_CHECK_TIMEOUT_SECONDS`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.file.group.count](#hoodiemetadataindexcolumnstatsfilegroupcount) | 2 (Optional) | Metadata column stats partition file group count. This controls the size of the base and log files and read parallelism in the column stats index partition. The recommendation is to size the file group count such that the base files are under 1GB.

`Config Param: METADATA_INDEX_COLUMN_STATS_FILE_GROUP_COUNT`
`Since Version: 0.11.0` | +| [hoodie.metadata.index.column.stats.inMemory.projection.threshold](#hoodiemetadataindexcolumnstatsinMemoryprojectionthreshold) | 100000 (Optional) | When reading Column Stats Index, if the size of the expected resulting projection is below the in-memory threshold (counted by the # of rows), it will be attempted to be loaded "in-memory" (ie not using the execution engine like Spark, Flink, etc). If the value is above the threshold execution engine will be used to compose the projection.

`Config Param: COLUMN_STATS_INDEX_IN_MEMORY_PROJECTION_THRESHOLD`
`Since Version: 0.12.0` | +| [hoodie.metadata.index.column.stats.parallelism](#hoodiemetadataindexcolumnstatsparallelism) | 200 (Optional) | Parallelism to use, when generating column stats index.

`Config Param: COLUMN_STATS_INDEX_PARALLELISM`
`Since Version: 0.11.0` | +| [hoodie.metadata.insert.parallelism](#hoodiemetadatainsertparallelism) | 1 (Optional) | Parallelism to use when inserting to the metadata table

`Config Param: INSERT_PARALLELISM_VALUE`
`Since Version: 0.7.0` | +| [hoodie.metadata.log.compaction.blocks.threshold](#hoodiemetadatalogcompactionblocksthreshold) | 5 (Optional) | Controls the criteria to log compact file groups in the metadata table.

`Config Param: LOG_COMPACT_BLOCKS_THRESHOLD` | +| [hoodie.metadata.log.compaction.enable](#hoodiemetadatalogcompactionenable) | false (Optional) | This config enables log compaction for the metadata table.

`Config Param: ENABLE_LOG_COMPACTION_ON_METADATA_TABLE`
`Since Version: 0.14` | +| [hoodie.metadata.max.deltacommits.when_pending](#hoodiemetadatamaxdeltacommitswhen_pending) | 1000 (Optional) | When there is a pending instant in data table, this config limits the allowed number of deltacommits in metadata table to prevent the metadata table's timeline from growing unboundedly as compaction won't be triggered due to the pending data table instant.

`Config Param: METADATA_MAX_NUM_DELTACOMMITS_WHEN_PENDING`
`Since Version: 0.14.0` | +| [hoodie.metadata.max.reader.buffer.size](#hoodiemetadatamaxreaderbuffersize) | 10485760 (Optional) | Max memory to use for the reader buffer while merging log blocks

`Config Param: MAX_READER_BUFFER_SIZE_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.max.reader.memory](#hoodiemetadatamaxreadermemory) | 1073741824 (Optional) | Max memory to use for the reader to read from metadata

`Config Param: MAX_READER_MEMORY_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.metrics.enable](#hoodiemetadatametricsenable) | false (Optional) | Enable publishing of metrics around metadata table.

`Config Param: METRICS_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.metadata.optimized.log.blocks.scan.enable](#hoodiemetadataoptimizedlogblocksscanenable) | false (Optional) | Optimized log blocks scanner that addresses all the multi-writer use-cases while appending to log files. It also differentiates original blocks written by ingestion writers and compacted blocks written by log compaction.

`Config Param: ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN`
`Since Version: 0.13.0` | +| [hoodie.metadata.record.index.enable](#hoodiemetadatarecordindexenable) | false (Optional) | Create the HUDI Record Index within the Metadata Table

`Config Param: RECORD_INDEX_ENABLE_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.record.index.growth.factor](#hoodiemetadatarecordindexgrowthfactor) | 2.0 (Optional) | The current number of records are multiplied by this number when estimating the number of file groups to create automatically. This helps account for growth in the number of records in the dataset.

`Config Param: RECORD_INDEX_GROWTH_FACTOR_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.record.index.max.filegroup.count](#hoodiemetadatarecordindexmaxfilegroupcount) | 1000 (Optional) | Maximum number of file groups to use for Record Index.

`Config Param: RECORD_INDEX_MAX_FILE_GROUP_COUNT_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.record.index.max.filegroup.size](#hoodiemetadatarecordindexmaxfilegroupsize) | 1073741824 (Optional) | Maximum size in bytes of a single file group. Large file group takes longer to compact.

`Config Param: RECORD_INDEX_MAX_FILE_GROUP_SIZE_BYTES_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.record.index.min.filegroup.count](#hoodiemetadatarecordindexminfilegroupcount) | 10 (Optional) | Minimum number of file groups to use for Record Index.

`Config Param: RECORD_INDEX_MIN_FILE_GROUP_COUNT_PROP`
`Since Version: 0.14.0` | +| [hoodie.metadata.spillable.map.path](#hoodiemetadataspillablemappath) | (Optional) | Path on local storage to use, when keys read from metadata are held in a spillable map.

`Config Param: SPILLABLE_MAP_DIR_PROP`
`Since Version: 0.14.0` | --- @@ -415,14 +415,14 @@ Configurations used by the Hudi Metaserver. [**Advanced Configs**](#Metaserver-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------- | ---------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.database.name](#hoodiedatabasename) | N/A **(Required)** | Database name that will be used for incremental query.If different databases have the same table name during incremental query, we can set it to limit the table name under a specific database | 0.13.0 | -| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name that will be used for registering with Hive. Needs to be same across runs. | 0.13.0 | -| [hoodie.metaserver.connect.retries](#hoodiemetaserverconnectretries) | 3 (Optional) | Number of retries while opening a connection to metaserver | 0.13.0 | -| [hoodie.metaserver.connect.retry.delay](#hoodiemetaserverconnectretrydelay) | 1 (Optional) | Number of seconds for the client to wait between consecutive connection attempts | 0.13.0 | -| [hoodie.metaserver.enabled](#hoodiemetaserverenabled) | false (Optional) | Enable Hudi metaserver for storing Hudi tables' metadata. | 0.13.0 | -| [hoodie.metaserver.uris](#hoodiemetaserveruris) | thrift://localhost:9090 (Optional) | Metaserver server uris | 0.13.0 | +| Config Name | Default | Description | +| --------------------------------------------------------------------------- | ---------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.database.name](#hoodiedatabasename) | N/A **(Required)** | Database name that will be used for incremental query.If different databases have the same table name during incremental query, we can set it to limit the table name under a specific database
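For illustration only, a small sketch of collecting the metaserver options documented in this table into a `java.util.Properties` object; values other than the documented defaults are placeholders, and wiring them into an actual writer or client is out of scope here.

```java
import java.util.Properties;

// Sketch: metaserver-related options from this table gathered into Properties.
public class MetaserverConfigSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("hoodie.metaserver.enabled", "true");                  // default: false
    props.setProperty("hoodie.metaserver.uris", "thrift://localhost:9090");  // documented default
    props.setProperty("hoodie.metaserver.connect.retries", "3");             // documented default
    props.setProperty("hoodie.metaserver.connect.retry.delay", "1");         // seconds
    props.setProperty("hoodie.database.name", "example_db");                 // placeholder
    props.setProperty("hoodie.table.name", "example_table");                 // placeholder
    props.list(System.out);
  }
}
```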

`Config Param: DATABASE_NAME`
`Since Version: 0.13.0` | +| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name that will be used for registering with Hive. Needs to be same across runs.

`Config Param: TABLE_NAME`
`Since Version: 0.13.0` | +| [hoodie.metaserver.connect.retries](#hoodiemetaserverconnectretries) | 3 (Optional) | Number of retries while opening a connection to metaserver

`Config Param: METASERVER_CONNECTION_RETRIES`
`Since Version: 0.13.0` | +| [hoodie.metaserver.connect.retry.delay](#hoodiemetaserverconnectretrydelay) | 1 (Optional) | Number of seconds for the client to wait between consecutive connection attempts

`Config Param: METASERVER_CONNECTION_RETRY_DELAY`
`Since Version: 0.13.0` | +| [hoodie.metaserver.enabled](#hoodiemetaserverenabled) | false (Optional) | Enable Hudi metaserver for storing Hudi tables' metadata.

`Config Param: METASERVER_ENABLE`
`Since Version: 0.13.0` | +| [hoodie.metaserver.uris](#hoodiemetaserveruris) | thrift://localhost:9090 (Optional) | Metaserver URIs

`Config Param: METASERVER_URLS`
`Since Version: 0.13.0` | --- @@ -434,35 +434,35 @@ Configurations that control aspects around writing, sizing, reading base and log [**Basic Configs**](#Storage-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------ | -------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.parquet.compression.codec](#hoodieparquetcompressioncodec) | gzip (Optional) | Compression Codec for parquet files | | -| [hoodie.parquet.max.file.size](#hoodieparquetmaxfilesize) | 125829120 (Optional) | Target size in bytes for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------ | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.parquet.compression.codec](#hoodieparquetcompressioncodec) | gzip (Optional) | Compression Codec for parquet files
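A hypothetical sketch of the two basic storage configs in this section, expressed as plain key/value write options; the snappy codec and 128 MB target are illustrative values, not recommendations.

```java
import java.util.LinkedHashMap;
import java.util.Map;

// Sketch: base-file compression and sizing options from this section, collected
// as options that could be merged into a Hudi write.
public class StorageConfigSketch {
  public static void main(String[] args) {
    Map<String, String> opts = new LinkedHashMap<>();
    opts.put("hoodie.parquet.compression.codec", "snappy");                       // default: gzip
    opts.put("hoodie.parquet.max.file.size", String.valueOf(128 * 1024 * 1024L)); // default: 125829120
    opts.forEach((k, v) -> System.out.println(k + "=" + v));
  }
}
```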

`Config Param: PARQUET_COMPRESSION_CODEC_NAME` | +| [hoodie.parquet.max.file.size](#hoodieparquetmaxfilesize) | 125829120 (Optional) | Target size in bytes for parquet files produced by Hudi write phases. For DFS, this needs to be aligned with the underlying filesystem block size for optimal performance.

`Config Param: PARQUET_MAX_FILE_SIZE` | [**Advanced Configs**](#Storage-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------------- | ------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.logfile.data.block.format](#hoodielogfiledatablockformat) | N/A **(Required)** | Format of the data block within delta logs. Following formats are currently supported "avro", "hfile", "parquet" | | -| [hoodie.avro.write.support.class](#hoodieavrowritesupportclass) | org.apache.hudi.avro.HoodieAvroWriteSupport (Optional) | Provided write support class should extend HoodieAvroWriteSupport class and it is loaded at runtime. This is only required when trying to override the existing write context. | 0.14.0 | -| [hoodie.hfile.block.size](#hoodiehfileblocksize) | 1048576 (Optional) | Lower values increase the size in bytes of metadata tracked within HFile, but can offer potentially faster lookup times. | | -| [hoodie.hfile.compression.algorithm](#hoodiehfilecompressionalgorithm) | GZ (Optional) | Compression codec to use for hfile base files. | | -| [hoodie.hfile.max.file.size](#hoodiehfilemaxfilesize) | 125829120 (Optional) | Target file size in bytes for HFile base files. | | -| [hoodie.logfile.data.block.max.size](#hoodielogfiledatablockmaxsize) | 268435456 (Optional) | LogFile Data block max size in bytes. This is the maximum size allowed for a single data block to be appended to a log file. This helps to make sure the data appended to the log file is broken up into sizable blocks to prevent from OOM errors. This size should be greater than the JVM memory. | | -| [hoodie.logfile.max.size](#hoodielogfilemaxsize) | 1073741824 (Optional) | LogFile max size in bytes. This is the maximum size allowed for a log file before it is rolled over to the next version. | | -| [hoodie.logfile.to.parquet.compression.ratio](#hoodielogfiletoparquetcompressionratio) | 0.35 (Optional) | Expected additional compression as records move from log files to parquet. Used for merge_on_read table to send inserts into log files & control the size of compacted parquet file. | | -| [hoodie.orc.block.size](#hoodieorcblocksize) | 125829120 (Optional) | ORC block size, recommended to be aligned with the target file size. | | -| [hoodie.orc.compression.codec](#hoodieorccompressioncodec) | ZLIB (Optional) | Compression codec to use for ORC base files. | | -| [hoodie.orc.max.file.size](#hoodieorcmaxfilesize) | 125829120 (Optional) | Target file size in bytes for ORC base files. | | -| [hoodie.orc.stripe.size](#hoodieorcstripesize) | 67108864 (Optional) | Size of the memory buffer in bytes for writing | | -| [hoodie.parquet.block.size](#hoodieparquetblocksize) | 125829120 (Optional) | Parquet RowGroup size in bytes. It's recommended to make this large enough that scan costs can be amortized by packing enough column values into a single row group. | | -| [hoodie.parquet.compression.ratio](#hoodieparquetcompressionratio) | 0.1 (Optional) | Expected compression of parquet data used by Hudi, when it tries to size new parquet files. 
Increase this value, if bulk_insert is producing smaller than expected sized files | | -| [hoodie.parquet.dictionary.enabled](#hoodieparquetdictionaryenabled) | true (Optional) | Whether to use dictionary encoding | | -| [hoodie.parquet.field_id.write.enabled](#hoodieparquetfield_idwriteenabled) | true (Optional) | Would only be effective with Spark 3.3+. Sets spark.sql.parquet.fieldId.write.enabled. If enabled, Spark will write out parquet native field ids that are stored inside StructField's metadata as parquet.field.id to parquet files. | 0.12.0 | -| [hoodie.parquet.outputtimestamptype](#hoodieparquetoutputtimestamptype) | TIMESTAMP_MICROS (Optional) | Sets spark.sql.parquet.outputTimestampType. Parquet timestamp type to use when Spark writes data to Parquet files. | | -| [hoodie.parquet.page.size](#hoodieparquetpagesize) | 1048576 (Optional) | Parquet page size in bytes. Page is the unit of read within a parquet file. Within a block, pages are compressed separately. | | -| [hoodie.parquet.writelegacyformat.enabled](#hoodieparquetwritelegacyformatenabled) | false (Optional) | Sets spark.sql.parquet.writeLegacyFormat. If true, data will be written in a way of Spark 1.4 and earlier. For example, decimal values will be written in Parquet's fixed-length byte array format which other systems such as Apache Hive and Apache Impala use. If false, the newer format in Parquet will be used. For example, decimals will be written in int-based format. | | +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------- | ------------------------------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.logfile.data.block.format](#hoodielogfiledatablockformat) | N/A **(Required)** | Format of the data block within delta logs. Following formats are currently supported "avro", "hfile", "parquet"

`Config Param: LOGFILE_DATA_BLOCK_FORMAT` | +| [hoodie.avro.write.support.class](#hoodieavrowritesupportclass) | org.apache.hudi.avro.HoodieAvroWriteSupport (Optional) | Provided write support class should extend HoodieAvroWriteSupport class and it is loaded at runtime. This is only required when trying to override the existing write context.

`Config Param: HOODIE_AVRO_WRITE_SUPPORT_CLASS`
`Since Version: 0.14.0` | +| [hoodie.hfile.block.size](#hoodiehfileblocksize) | 1048576 (Optional) | Lower values increase the size in bytes of metadata tracked within HFile, but can offer potentially faster lookup times.

`Config Param: HFILE_BLOCK_SIZE` | +| [hoodie.hfile.compression.algorithm](#hoodiehfilecompressionalgorithm) | GZ (Optional) | Compression codec to use for hfile base files.

`Config Param: HFILE_COMPRESSION_ALGORITHM_NAME` | +| [hoodie.hfile.max.file.size](#hoodiehfilemaxfilesize) | 125829120 (Optional) | Target file size in bytes for HFile base files.

`Config Param: HFILE_MAX_FILE_SIZE` | +| [hoodie.logfile.data.block.max.size](#hoodielogfiledatablockmaxsize) | 268435456 (Optional) | LogFile Data block max size in bytes. This is the maximum size allowed for a single data block to be appended to a log file. This helps to make sure the data appended to the log file is broken up into sizable blocks to prevent from OOM errors. This size should be greater than the JVM memory.

`Config Param: LOGFILE_DATA_BLOCK_MAX_SIZE` | +| [hoodie.logfile.max.size](#hoodielogfilemaxsize) | 1073741824 (Optional) | LogFile max size in bytes. This is the maximum size allowed for a log file before it is rolled over to the next version.

`Config Param: LOGFILE_MAX_SIZE` | +| [hoodie.logfile.to.parquet.compression.ratio](#hoodielogfiletoparquetcompressionratio) | 0.35 (Optional) | Expected additional compression as records move from log files to parquet. Used for merge_on_read table to send inserts into log files & control the size of compacted parquet file.

`Config Param: LOGFILE_TO_PARQUET_COMPRESSION_RATIO_FRACTION` | +| [hoodie.orc.block.size](#hoodieorcblocksize) | 125829120 (Optional) | ORC block size, recommended to be aligned with the target file size.

`Config Param: ORC_BLOCK_SIZE` | +| [hoodie.orc.compression.codec](#hoodieorccompressioncodec) | ZLIB (Optional) | Compression codec to use for ORC base files.

`Config Param: ORC_COMPRESSION_CODEC_NAME` | +| [hoodie.orc.max.file.size](#hoodieorcmaxfilesize) | 125829120 (Optional) | Target file size in bytes for ORC base files.

`Config Param: ORC_FILE_MAX_SIZE` | +| [hoodie.orc.stripe.size](#hoodieorcstripesize) | 67108864 (Optional) | Size of the memory buffer in bytes for writing

`Config Param: ORC_STRIPE_SIZE` | +| [hoodie.parquet.block.size](#hoodieparquetblocksize) | 125829120 (Optional) | Parquet RowGroup size in bytes. It's recommended to make this large enough that scan costs can be amortized by packing enough column values into a single row group.

`Config Param: PARQUET_BLOCK_SIZE` | +| [hoodie.parquet.compression.ratio](#hoodieparquetcompressionratio) | 0.1 (Optional) | Expected compression of parquet data used by Hudi, when it tries to size new parquet files. Increase this value, if bulk_insert is producing smaller than expected sized files

`Config Param: PARQUET_COMPRESSION_RATIO_FRACTION` | +| [hoodie.parquet.dictionary.enabled](#hoodieparquetdictionaryenabled) | true (Optional) | Whether to use dictionary encoding

`Config Param: PARQUET_DICTIONARY_ENABLED` | +| [hoodie.parquet.field_id.write.enabled](#hoodieparquetfield_idwriteenabled) | true (Optional) | Would only be effective with Spark 3.3+. Sets spark.sql.parquet.fieldId.write.enabled. If enabled, Spark will write out parquet native field ids that are stored inside StructField's metadata as parquet.field.id to parquet files.

`Config Param: PARQUET_FIELD_ID_WRITE_ENABLED`
`Since Version: 0.12.0` | +| [hoodie.parquet.outputtimestamptype](#hoodieparquetoutputtimestamptype) | TIMESTAMP_MICROS (Optional) | Sets spark.sql.parquet.outputTimestampType. Parquet timestamp type to use when Spark writes data to Parquet files.

`Config Param: PARQUET_OUTPUT_TIMESTAMP_TYPE` | +| [hoodie.parquet.page.size](#hoodieparquetpagesize) | 1048576 (Optional) | Parquet page size in bytes. Page is the unit of read within a parquet file. Within a block, pages are compressed separately.

`Config Param: PARQUET_PAGE_SIZE` | +| [hoodie.parquet.writelegacyformat.enabled](#hoodieparquetwritelegacyformatenabled) | false (Optional) | Sets spark.sql.parquet.writeLegacyFormat. If true, data will be written in a way of Spark 1.4 and earlier. For example, decimal values will be written in Parquet's fixed-length byte array format which other systems such as Apache Hive and Apache Impala use. If false, the newer format in Parquet will be used. For example, decimals will be written in int-based format.

`Config Param: PARQUET_WRITE_LEGACY_FORMAT_ENABLED` | --- @@ -474,14 +474,14 @@ The consistency guard related config options, to help talk to eventually consist [**Advanced Configs**](#Consistency-Guard-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------ | -| [_hoodie.optimistic.consistency.guard.enable](#_hoodieoptimisticconsistencyguardenable) | false (Optional) | Enable consistency guard, which optimistically assumes consistency is achieved after a certain time period. | 0.6.0 | -| [hoodie.consistency.check.enabled](#hoodieconsistencycheckenabled) | false (Optional) | Enabled to handle S3 eventual consistency issue. This property is no longer required since S3 is now strongly consistent. Will be removed in the future releases. | 0.5.0. Deprecated since: 0.7.0 | -| [hoodie.consistency.check.initial_interval_ms](#hoodieconsistencycheckinitial_interval_ms) | 400 (Optional) | Amount of time (in ms) to wait, before checking for consistency after an operation on storage. | 0.5.0. Deprecated since: 0.7.0 | -| [hoodie.consistency.check.max_checks](#hoodieconsistencycheckmax_checks) | 6 (Optional) | Maximum number of consistency checks to perform, with exponential backoff. | 0.5.0. Deprecated since: 0.7.0 | -| [hoodie.consistency.check.max_interval_ms](#hoodieconsistencycheckmax_interval_ms) | 20000 (Optional) | Maximum amount of time (in ms), to wait for consistency checking. | 0.5.0. Deprecated since: 0.7.0 | -| [hoodie.optimistic.consistency.guard.sleep_time_ms](#hoodieoptimisticconsistencyguardsleep_time_ms) | 500 (Optional) | Amount of time (in ms), to wait after which we assume storage is consistent. | 0.6.0 | +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [_hoodie.optimistic.consistency.guard.enable](#_hoodieoptimisticconsistencyguardenable) | false (Optional) | Enable consistency guard, which optimistically assumes consistency is achieved after a certain time period.
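A brief sketch, for orientation only, of the optimistic consistency guard options from this section; the older hoodie.consistency.check.* options are deprecated since 0.7.0, as noted in the rows below.

```java
import java.util.Properties;

// Sketch: optimistic consistency guard settings, with the documented defaults noted.
public class ConsistencyGuardSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("_hoodie.optimistic.consistency.guard.enable", "true");      // default: false
    props.setProperty("hoodie.optimistic.consistency.guard.sleep_time_ms", "500"); // documented default
    props.list(System.out);
  }
}
```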

`Config Param: OPTIMISTIC_CONSISTENCY_GUARD_ENABLE`
`Since Version: 0.6.0` | +| [hoodie.consistency.check.enabled](#hoodieconsistencycheckenabled) | false (Optional) | Enabled to handle S3 eventual consistency issue. This property is no longer required since S3 is now strongly consistent. Will be removed in the future releases.

`Config Param: ENABLE`
`Since Version: 0.5.0`
`Deprecated since: 0.7.0` | +| [hoodie.consistency.check.initial_interval_ms](#hoodieconsistencycheckinitial_interval_ms) | 400 (Optional) | Amount of time (in ms) to wait, before checking for consistency after an operation on storage.

`Config Param: INITIAL_CHECK_INTERVAL_MS`
`Since Version: 0.5.0`
`Deprecated since: 0.7.0` | +| [hoodie.consistency.check.max_checks](#hoodieconsistencycheckmax_checks) | 6 (Optional) | Maximum number of consistency checks to perform, with exponential backoff.

`Config Param: MAX_CHECKS`
`Since Version: 0.5.0`
`Deprecated since: 0.7.0` | +| [hoodie.consistency.check.max_interval_ms](#hoodieconsistencycheckmax_interval_ms) | 20000 (Optional) | Maximum amount of time (in ms), to wait for consistency checking.

`Config Param: MAX_CHECK_INTERVAL_MS`
`Since Version: 0.5.0`
`Deprecated since: 0.7.0` | +| [hoodie.optimistic.consistency.guard.sleep_time_ms](#hoodieoptimisticconsistencyguardsleep_time_ms) | 500 (Optional) | Amount of time (in ms), to wait after which we assume storage is consistent.

`Config Param: OPTIMISTIC_CONSISTENCY_GUARD_SLEEP_TIME_MS`
`Since Version: 0.6.0` | --- @@ -493,13 +493,13 @@ The filesystem retry related config options, to help deal with runtime exception [**Advanced Configs**](#FileSystem-Guard-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.filesystem.operation.retry.enable](#hoodiefilesystemoperationretryenable) | false (Optional) | Enabled to handle list/get/delete etc file system performance issue. | 0.11.0 | -| [hoodie.filesystem.operation.retry.exceptions](#hoodiefilesystemoperationretryexceptions) | (Optional) | The class name of the Exception that needs to be retried, separated by commas. Default is empty which means retry all the IOException and RuntimeException from FileSystem | 0.11.0 | -| [hoodie.filesystem.operation.retry.initial_interval_ms](#hoodiefilesystemoperationretryinitial_interval_ms) | 100 (Optional) | Amount of time (in ms) to wait, before retry to do operations on storage. | 0.11.0 | -| [hoodie.filesystem.operation.retry.max_interval_ms](#hoodiefilesystemoperationretrymax_interval_ms) | 2000 (Optional) | Maximum amount of time (in ms), to wait for next retry. | 0.11.0 | -| [hoodie.filesystem.operation.retry.max_numbers](#hoodiefilesystemoperationretrymax_numbers) | 4 (Optional) | Maximum number of retry actions to perform, with exponential backoff. | 0.11.0 | +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------------------- | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.filesystem.operation.retry.enable](#hoodiefilesystemoperationretryenable) | false (Optional) | Enabled to handle list/get/delete etc file system performance issue.
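As a sketch of how the retry options in this section fit together, the snippet below enables filesystem operation retries; the interval and count values shown are the documented defaults.

```java
import java.util.Properties;

// Sketch: enabling filesystem operation retries using the keys from this section.
public class FileSystemRetrySketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("hoodie.filesystem.operation.retry.enable", "true");             // default: false
    props.setProperty("hoodie.filesystem.operation.retry.initial_interval_ms", "100");
    props.setProperty("hoodie.filesystem.operation.retry.max_interval_ms", "2000");
    props.setProperty("hoodie.filesystem.operation.retry.max_numbers", "4");
    props.list(System.out);
  }
}
```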

`Config Param: FILESYSTEM_RETRY_ENABLE`
`Since Version: 0.11.0` | +| [hoodie.filesystem.operation.retry.exceptions](#hoodiefilesystemoperationretryexceptions) | (Optional) | The class name of the Exception that needs to be retried, separated by commas. Default is empty which means retry all the IOException and RuntimeException from FileSystem

`Config Param: RETRY_EXCEPTIONS`
`Since Version: 0.11.0` | +| [hoodie.filesystem.operation.retry.initial_interval_ms](#hoodiefilesystemoperationretryinitial_interval_ms) | 100 (Optional) | Amount of time (in ms) to wait, before retry to do operations on storage.

`Config Param: INITIAL_RETRY_INTERVAL_MS`
`Since Version: 0.11.0` | +| [hoodie.filesystem.operation.retry.max_interval_ms](#hoodiefilesystemoperationretrymax_interval_ms) | 2000 (Optional) | Maximum amount of time (in ms), to wait for next retry.

`Config Param: MAX_RETRY_INTERVAL_MS`
`Since Version: 0.11.0` | +| [hoodie.filesystem.operation.retry.max_numbers](#hoodiefilesystemoperationretrymax_numbers) | 4 (Optional) | Maximum number of retry actions to perform, with exponential backoff.

`Config Param: MAX_RETRY_NUMBERS`
`Since Version: 0.11.0` | --- @@ -511,28 +511,28 @@ Configurations that control how file metadata is stored by Hudi, for transaction [**Advanced Configs**](#File-System-View-Storage-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.filesystem.remote.backup.view.enable](#hoodiefilesystemremotebackupviewenable) | true (Optional) | Config to control whether backup needs to be configured if clients were not able to reach timeline service. | | -| [hoodie.filesystem.view.incr.timeline.sync.enable](#hoodiefilesystemviewincrtimelinesyncenable) | false (Optional) | Controls whether or not, the file system view is incrementally updated as new actions are performed on the timeline. | | -| [hoodie.filesystem.view.remote.host](#hoodiefilesystemviewremotehost) | localhost (Optional) | We expect this to be rarely hand configured. | | -| [hoodie.filesystem.view.remote.port](#hoodiefilesystemviewremoteport) | 26754 (Optional) | Port to serve file system view queries, when remote. We expect this to be rarely hand configured. | | -| [hoodie.filesystem.view.remote.retry.enable](#hoodiefilesystemviewremoteretryenable) | false (Optional) | Whether to enable API request retry for remote file system view. | 0.12.1 | -| [hoodie.filesystem.view.remote.retry.exceptions](#hoodiefilesystemviewremoteretryexceptions) | (Optional) | The class name of the Exception that needs to be retried, separated by commas. Default is empty which means retry all the IOException and RuntimeException from Remote Request. | 0.12.1 | -| [hoodie.filesystem.view.remote.retry.initial_interval_ms](#hoodiefilesystemviewremoteretryinitial_interval_ms) | 100 (Optional) | Amount of time (in ms) to wait, before retry to do operations on storage. | 0.12.1 | -| [hoodie.filesystem.view.remote.retry.max_interval_ms](#hoodiefilesystemviewremoteretrymax_interval_ms) | 2000 (Optional) | Maximum amount of time (in ms), to wait for next retry. | 0.12.1 | -| [hoodie.filesystem.view.remote.retry.max_numbers](#hoodiefilesystemviewremoteretrymax_numbers) | 3 (Optional) | Maximum number of retry for API requests against a remote file system view. e.g timeline server. | 0.12.1 | -| [hoodie.filesystem.view.remote.timeout.secs](#hoodiefilesystemviewremotetimeoutsecs) | 300 (Optional) | Timeout in seconds, to wait for API requests against a remote file system view. e.g timeline server. | | -| [hoodie.filesystem.view.rocksdb.base.path](#hoodiefilesystemviewrocksdbbasepath) | /tmp/hoodie_timeline_rocksdb (Optional) | Path on local storage to use, when storing file system view in embedded kv store/rocksdb. | | -| [hoodie.filesystem.view.secondary.type](#hoodiefilesystemviewsecondarytype) | MEMORY (Optional) | Specifies the secondary form of storage for file system view, if the primary (e.g timeline server) is unavailable. 
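To illustrate the file system view options in this section, here is a hypothetical sketch that selects a remote-first view with an in-memory fallback; the spillable directory is a placeholder path, and the memory value is the documented default.

```java
import java.util.Properties;

// Sketch: choosing how the file system view is held, using keys from this section.
// REMOTE_FIRST consults the timeline server and falls back to the secondary view type.
public class FileSystemViewSketch {
  public static void main(String[] args) {
    Properties props = new Properties();
    props.setProperty("hoodie.filesystem.view.type", "REMOTE_FIRST");                             // default: MEMORY
    props.setProperty("hoodie.filesystem.view.secondary.type", "MEMORY");                         // documented default
    props.setProperty("hoodie.filesystem.view.spillable.mem", String.valueOf(100 * 1024 * 1024)); // 104857600 bytes
    props.setProperty("hoodie.filesystem.view.spillable.dir", "/tmp/hudi-view");                  // placeholder path
    props.list(System.out);
  }
}
```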
| | -| [hoodie.filesystem.view.spillable.bootstrap.base.file.mem.fraction](#hoodiefilesystemviewspillablebootstrapbasefilememfraction) | 0.05 (Optional) | Fraction of the file system view memory, to be used for holding mapping to bootstrap base files. | | -| [hoodie.filesystem.view.spillable.clustering.mem.fraction](#hoodiefilesystemviewspillableclusteringmemfraction) | 0.01 (Optional) | Fraction of the file system view memory, to be used for holding clustering related metadata. | | -| [hoodie.filesystem.view.spillable.compaction.mem.fraction](#hoodiefilesystemviewspillablecompactionmemfraction) | 0.8 (Optional) | Fraction of the file system view memory, to be used for holding compaction related metadata. | | -| [hoodie.filesystem.view.spillable.dir](#hoodiefilesystemviewspillabledir) | /tmp/ (Optional) | Path on local storage to use, when file system view is held in a spillable map. | | -| [hoodie.filesystem.view.spillable.log.compaction.mem.fraction](#hoodiefilesystemviewspillablelogcompactionmemfraction) | 0.8 (Optional) | Fraction of the file system view memory, to be used for holding log compaction related metadata. | 0.13.0 | -| [hoodie.filesystem.view.spillable.mem](#hoodiefilesystemviewspillablemem) | 104857600 (Optional) | Amount of memory to be used in bytes for holding file system view, before spilling to disk. | | -| [hoodie.filesystem.view.spillable.replaced.mem.fraction](#hoodiefilesystemviewspillablereplacedmemfraction) | 0.01 (Optional) | Fraction of the file system view memory, to be used for holding replace commit related metadata. | | -| [hoodie.filesystem.view.type](#hoodiefilesystemviewtype) | MEMORY (Optional) | File system view provides APIs for viewing the files on the underlying lake storage, as file groups and file slices. This config controls how such a view is held. Options include MEMORY,SPILLABLE_DISK,EMBEDDED_KV_STORE,REMOTE_ONLY,REMOTE_FIRST which provide different trade offs for memory usage and API request performance. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------- | --------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.filesystem.remote.backup.view.enable](#hoodiefilesystemremotebackupviewenable) | true (Optional) | Config to control whether backup needs to be configured if clients were not able to reach timeline service.

`Config Param: REMOTE_BACKUP_VIEW_ENABLE` | +| [hoodie.filesystem.view.incr.timeline.sync.enable](#hoodiefilesystemviewincrtimelinesyncenable) | false (Optional) | Controls whether or not, the file system view is incrementally updated as new actions are performed on the timeline.

`Config Param: INCREMENTAL_TIMELINE_SYNC_ENABLE` | +| [hoodie.filesystem.view.remote.host](#hoodiefilesystemviewremotehost) | localhost (Optional) | We expect this to be rarely hand configured.

`Config Param: REMOTE_HOST_NAME` | +| [hoodie.filesystem.view.remote.port](#hoodiefilesystemviewremoteport) | 26754 (Optional) | Port to serve file system view queries, when remote. We expect this to be rarely hand configured.

`Config Param: REMOTE_PORT_NUM` | +| [hoodie.filesystem.view.remote.retry.enable](#hoodiefilesystemviewremoteretryenable) | false (Optional) | Whether to enable API request retry for remote file system view.

`Config Param: REMOTE_RETRY_ENABLE`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.retry.exceptions](#hoodiefilesystemviewremoteretryexceptions) | (Optional) | The class name of the Exception that needs to be retried, separated by commas. Default is empty which means retry all the IOException and RuntimeException from Remote Request.

`Config Param: RETRY_EXCEPTIONS`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.retry.initial_interval_ms](#hoodiefilesystemviewremoteretryinitial_interval_ms) | 100 (Optional) | Amount of time (in ms) to wait, before retry to do operations on storage.

`Config Param: REMOTE_INITIAL_RETRY_INTERVAL_MS`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.retry.max_interval_ms](#hoodiefilesystemviewremoteretrymax_interval_ms) | 2000 (Optional) | Maximum amount of time (in ms), to wait for next retry.

`Config Param: REMOTE_MAX_RETRY_INTERVAL_MS`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.retry.max_numbers](#hoodiefilesystemviewremoteretrymax_numbers) | 3 (Optional) | Maximum number of retry for API requests against a remote file system view. e.g timeline server.

`Config Param: REMOTE_MAX_RETRY_NUMBERS`
`Since Version: 0.12.1` | +| [hoodie.filesystem.view.remote.timeout.secs](#hoodiefilesystemviewremotetimeoutsecs) | 300 (Optional) | Timeout in seconds, to wait for API requests against a remote file system view. e.g timeline server.

`Config Param: REMOTE_TIMEOUT_SECS` | +| [hoodie.filesystem.view.rocksdb.base.path](#hoodiefilesystemviewrocksdbbasepath) | /tmp/hoodie_timeline_rocksdb (Optional) | Path on local storage to use, when storing file system view in embedded kv store/rocksdb.

`Config Param: ROCKSDB_BASE_PATH` | +| [hoodie.filesystem.view.secondary.type](#hoodiefilesystemviewsecondarytype) | MEMORY (Optional) | Specifies the secondary form of storage for file system view, if the primary (e.g timeline server) is unavailable.

`Config Param: SECONDARY_VIEW_TYPE` | +| [hoodie.filesystem.view.spillable.bootstrap.base.file.mem.fraction](#hoodiefilesystemviewspillablebootstrapbasefilememfraction) | 0.05 (Optional) | Fraction of the file system view memory, to be used for holding mapping to bootstrap base files.

`Config Param: BOOTSTRAP_BASE_FILE_MEM_FRACTION` | +| [hoodie.filesystem.view.spillable.clustering.mem.fraction](#hoodiefilesystemviewspillableclusteringmemfraction) | 0.01 (Optional) | Fraction of the file system view memory, to be used for holding clustering related metadata.

`Config Param: SPILLABLE_CLUSTERING_MEM_FRACTION` | +| [hoodie.filesystem.view.spillable.compaction.mem.fraction](#hoodiefilesystemviewspillablecompactionmemfraction) | 0.8 (Optional) | Fraction of the file system view memory, to be used for holding compaction related metadata.

`Config Param: SPILLABLE_COMPACTION_MEM_FRACTION` | +| [hoodie.filesystem.view.spillable.dir](#hoodiefilesystemviewspillabledir) | /tmp/ (Optional) | Path on local storage to use, when file system view is held in a spillable map.

`Config Param: SPILLABLE_DIR` | +| [hoodie.filesystem.view.spillable.log.compaction.mem.fraction](#hoodiefilesystemviewspillablelogcompactionmemfraction) | 0.8 (Optional) | Fraction of the file system view memory, to be used for holding log compaction related metadata.

`Config Param: SPILLABLE_LOG_COMPACTION_MEM_FRACTION`
`Since Version: 0.13.0` | +| [hoodie.filesystem.view.spillable.mem](#hoodiefilesystemviewspillablemem) | 104857600 (Optional) | Amount of memory to be used in bytes for holding file system view, before spilling to disk.

`Config Param: SPILLABLE_MEMORY` | +| [hoodie.filesystem.view.spillable.replaced.mem.fraction](#hoodiefilesystemviewspillablereplacedmemfraction) | 0.01 (Optional) | Fraction of the file system view memory, to be used for holding replace commit related metadata.

`Config Param: SPILLABLE_REPLACED_MEM_FRACTION` | +| [hoodie.filesystem.view.type](#hoodiefilesystemviewtype) | MEMORY (Optional) | File system view provides APIs for viewing the files on the underlying lake storage, as file groups and file slices. This config controls how such a view is held. Options include MEMORY, SPILLABLE_DISK, EMBEDDED_KV_STORE, REMOTE_ONLY and REMOTE_FIRST, which provide different trade-offs for memory usage and API request performance.

`Config Param: VIEW_TYPE` | --- @@ -544,24 +544,24 @@ Configurations that control archival. [**Basic Configs**](#Archival-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------ | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.keep.max.commits](#hoodiekeepmaxcommits) | 30 (Optional) | Archiving service moves older entries from timeline into an archived log after each write, to keep the metadata overhead constant, even as the table size grows. This config controls the maximum number of instants to retain in the active timeline. | | -| [hoodie.keep.min.commits](#hoodiekeepmincommits) | 20 (Optional) | Similar to hoodie.keep.max.commits, but controls the minimum number of instants to retain in the active timeline. | | +| Config Name | Default | Description | +| ------------------------------------------------ | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.keep.max.commits](#hoodiekeepmaxcommits) | 30 (Optional) | Archiving service moves older entries from timeline into an archived log after each write, to keep the metadata overhead constant, even as the table size grows. This config controls the maximum number of instants to retain in the active timeline.
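Roughly speaking, the two basic configs in this section bound the size of the active timeline: once active instants exceed hoodie.keep.max.commits, archival trims them back towards hoodie.keep.min.commits. A small sketch, using the documented defaults:

```java
// Sketch: the archival window implied by the two basic archival configs below;
// the max must stay above the min for the window to be valid.
public class ArchivalWindowSketch {
  public static void main(String[] args) {
    int minCommitsToKeep = 20; // hoodie.keep.min.commits
    int maxCommitsToKeep = 30; // hoodie.keep.max.commits
    if (maxCommitsToKeep <= minCommitsToKeep) {
      throw new IllegalArgumentException(
          "hoodie.keep.max.commits must be greater than hoodie.keep.min.commits");
    }
    System.out.printf("Archive once active instants exceed %d, trimming back towards %d%n",
        maxCommitsToKeep, minCommitsToKeep);
  }
}
```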

`Config Param: MAX_COMMITS_TO_KEEP` | +| [hoodie.keep.min.commits](#hoodiekeepmincommits) | 20 (Optional) | Similar to hoodie.keep.max.commits, but controls the minimum number of instants to retain in the active timeline.

`Config Param: MIN_COMMITS_TO_KEEP` | [**Advanced Configs**](#Archival-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------- | ------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.archive.async](#hoodiearchiveasync) | false (Optional) | Only applies when hoodie.archive.automatic is turned on. When turned on runs archiver async with writing, which can speed up overall write performance. | 0.11.0 | -| [hoodie.archive.automatic](#hoodiearchiveautomatic) | true (Optional) | When enabled, the archival table service is invoked immediately after each commit, to archive commits if we cross a maximum value of commits. It's recommended to enable this, to ensure number of active commits is bounded. | | -| [hoodie.archive.beyond.savepoint](#hoodiearchivebeyondsavepoint) | false (Optional) | If enabled, archival will proceed beyond savepoint, skipping savepoint commits. If disabled, archival will stop at the earliest savepoint commit. | 0.12.0 | -| [hoodie.archive.delete.parallelism](#hoodiearchivedeleteparallelism) | 100 (Optional) | When performing archival operation, Hudi needs to delete the files of the archived instants in the active timeline in .hoodie folder. The file deletion also happens after merging small archived files into larger ones if enabled. This config limits the Spark parallelism for deleting files in both cases, i.e., parallelism of deleting files does not go above the configured value and the parallelism is the number of files to delete if smaller than the configured value. If you see that the file deletion in archival operation is slow because of the limited parallelism, you can increase this to tune the performance. | | -| [hoodie.archive.merge.enable](#hoodiearchivemergeenable) | false (Optional) | When enable, hoodie will auto merge several small archive files into larger one. It's useful when storage scheme doesn't support append operation. | | -| [hoodie.archive.merge.files.batch.size](#hoodiearchivemergefilesbatchsize) | 10 (Optional) | The number of small archive files to be merged at once. | | -| [hoodie.archive.merge.small.file.limit.bytes](#hoodiearchivemergesmallfilelimitbytes) | 20971520 (Optional) | This config sets the archive file size limit below which an archive file becomes a candidate to be selected as such a small file. | | -| [hoodie.commits.archival.batch](#hoodiecommitsarchivalbatch) | 10 (Optional) | Archiving of instants is batched in best-effort manner, to pack more instants into a single archive log. This config controls such archival batch size. 
| | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.archive.async](#hoodiearchiveasync) | false (Optional) | Only applies when hoodie.archive.automatic is turned on. When turned on runs archiver async with writing, which can speed up overall write performance.

`Config Param: ASYNC_ARCHIVE`
`Since Version: 0.11.0` | +| [hoodie.archive.automatic](#hoodiearchiveautomatic) | true (Optional) | When enabled, the archival table service is invoked immediately after each commit, to archive commits if we cross a maximum value of commits. It's recommended to enable this, to ensure the number of active commits is bounded.

`Config Param: AUTO_ARCHIVE` | +| [hoodie.archive.beyond.savepoint](#hoodiearchivebeyondsavepoint) | false (Optional) | If enabled, archival will proceed beyond savepoint, skipping savepoint commits. If disabled, archival will stop at the earliest savepoint commit.

`Config Param: ARCHIVE_BEYOND_SAVEPOINT`
`Since Version: 0.12.0` | +| [hoodie.archive.delete.parallelism](#hoodiearchivedeleteparallelism) | 100 (Optional) | When performing archival operation, Hudi needs to delete the files of the archived instants in the active timeline in .hoodie folder. The file deletion also happens after merging small archived files into larger ones if enabled. This config limits the Spark parallelism for deleting files in both cases, i.e., parallelism of deleting files does not go above the configured value and the parallelism is the number of files to delete if smaller than the configured value. If you see that the file deletion in archival operation is slow because of the limited parallelism, you can increase this to tune the performance.

`Config Param: DELETE_ARCHIVED_INSTANT_PARALLELISM_VALUE` | +| [hoodie.archive.merge.enable](#hoodiearchivemergeenable) | false (Optional) | When enabled, hoodie will auto-merge several small archive files into a larger one. It's useful when the storage scheme doesn't support the append operation.

`Config Param: ARCHIVE_MERGE_ENABLE` | +| [hoodie.archive.merge.files.batch.size](#hoodiearchivemergefilesbatchsize) | 10 (Optional) | The number of small archive files to be merged at once.

`Config Param: ARCHIVE_MERGE_FILES_BATCH_SIZE` | +| [hoodie.archive.merge.small.file.limit.bytes](#hoodiearchivemergesmallfilelimitbytes) | 20971520 (Optional) | This config sets the archive file size limit below which an archive file becomes a candidate to be selected as such a small file.

`Config Param: ARCHIVE_MERGE_SMALL_FILE_LIMIT_BYTES` | +| [hoodie.commits.archival.batch](#hoodiecommitsarchivalbatch) | 10 (Optional) | Archiving of instants is batched in best-effort manner, to pack more instants into a single archive log. This config controls such archival batch size.

`Config Param: COMMITS_ARCHIVAL_BATCH_SIZE` | --- @@ -573,23 +573,23 @@ Configurations that control how you want to bootstrap your existing tables for t [**Basic Configs**](#Bootstrap-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------ | ------------------ | ---------------------------------------------------------------------- | ------------- | -| [hoodie.bootstrap.base.path](#hoodiebootstrapbasepath) | N/A **(Required)** | Base path of the dataset that needs to be bootstrapped as a Hudi table | 0.6.0 | +| Config Name | Default | Description | +| ------------------------------------------------------ | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.bootstrap.base.path](#hoodiebootstrapbasepath) | N/A **(Required)** | Base path of the dataset that needs to be bootstrapped as a Hudi table
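For illustration, the Archival Configs listed above can be passed as Spark datasource write options. The sketch below is a minimal, hypothetical example: the helper name, input DataFrame, and target base path are assumptions, and the mandatory Hudi write options (table name, record key, precombine field) are omitted.

```scala
import org.apache.spark.sql.{DataFrame, SaveMode}

// Minimal sketch: tuning archival through Spark datasource write options.
// Only keys from the Archival Configs tables are used; the mandatory Hudi
// write options (table name, record key, precombine field, ...) are omitted.
def writeWithArchivalTuning(df: DataFrame, basePath: String): Unit = {
  df.write.format("hudi")
    .option("hoodie.keep.min.commits", "20")  // min instants kept in the active timeline
    .option("hoodie.keep.max.commits", "30")  // max instants before archival kicks in
    .option("hoodie.archive.async", "true")   // run the archiver async with the write
    .mode(SaveMode.Append)
    .save(basePath)
}
```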

`Config Param: BASE_PATH`
`Since Version: 0.6.0` | [**Advanced Configs**](#Bootstrap-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.bootstrap.data.queries.only](#hoodiebootstrapdataqueriesonly) | false (Optional) | Improves query performance, but queries cannot use hudi metadata fields | 0.14.0 | -| [hoodie.bootstrap.full.input.provider](#hoodiebootstrapfullinputprovider) | org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider (Optional) | Class to use for reading the bootstrap dataset partitions/files, for Bootstrap mode FULL_RECORD | 0.6.0 | -| [hoodie.bootstrap.index.class](#hoodiebootstrapindexclass) | org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex (Optional) | Implementation to use, for mapping a skeleton base file to a bootstrap base file. | 0.6.0 | -| [hoodie.bootstrap.mode.selector](#hoodiebootstrapmodeselector) | org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector (Optional) | Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped | 0.6.0 | -| [hoodie.bootstrap.mode.selector.regex](#hoodiebootstrapmodeselectorregex) | .* (Optional) | Matches each bootstrap dataset partition against this regex and applies the mode below to it. | 0.6.0 | -| [hoodie.bootstrap.mode.selector.regex.mode](#hoodiebootstrapmodeselectorregexmode) | METADATA_ONLY (Optional) | org.apache.hudi.client.bootstrap.BootstrapMode: Bootstrap mode for importing an existing table into Hudi FULL_RECORD: In this mode, the full record data is copied into hudi and metadata columns are added. A full record bootstrap is functionally equivalent to a bulk-insert. After a full record bootstrap, Hudi will function properly even if the original table is modified or deleted. METADATA_ONLY(default): In this mode, the full record data is not copied into Hudi therefore it avoids full cost of rewriting the dataset. Instead, 'skeleton' files containing just the corresponding metadata columns are added to the Hudi table. Hudi relies on the data in the original table and will face data-loss or corruption if files in the original table location are deleted or modified. | 0.6.0 | -| [hoodie.bootstrap.parallelism](#hoodiebootstrapparallelism) | 1500 (Optional) | For metadata-only bootstrap, Hudi parallelizes the operation so that each table partition is handled by one Spark task. This config limits the number of parallelism. We pick the configured parallelism if the number of table partitions is larger than this configured value. 
The parallelism is assigned to the number of table partitions if it is smaller than the configured value. For full-record bootstrap, i.e., BULK_INSERT operation of the records, this configured value is passed as the BULK_INSERT shuffle parallelism (`hoodie.bulkinsert.shuffle.parallelism`), determining the BULK_INSERT write behavior. If you see that the bootstrap is slow due to the limited parallelism, you can increase this. | 0.6.0 | -| [hoodie.bootstrap.partitionpath.translator.class](#hoodiebootstrappartitionpathtranslatorclass) | org.apache.hudi.client.bootstrap.translator.IdentityBootstrapPartitionPathTranslator (Optional) | Translates the partition paths from the bootstrapped data into how is laid out as a Hudi table. | 0.6.0 | +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.bootstrap.data.queries.only](#hoodiebootstrapdataqueriesonly) | false (Optional) | Improves query performance, but queries cannot use hudi metadata fields

`Config Param: DATA_QUERIES_ONLY`
`Since Version: 0.14.0` | +| [hoodie.bootstrap.full.input.provider](#hoodiebootstrapfullinputprovider) | org.apache.hudi.bootstrap.SparkParquetBootstrapDataProvider (Optional) | Class to use for reading the bootstrap dataset partitions/files, for Bootstrap mode FULL_RECORD

`Config Param: FULL_BOOTSTRAP_INPUT_PROVIDER_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.index.class](#hoodiebootstrapindexclass) | org.apache.hudi.common.bootstrap.index.HFileBootstrapIndex (Optional) | Implementation to use, for mapping a skeleton base file to a bootstrap base file.

`Config Param: INDEX_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.mode.selector](#hoodiebootstrapmodeselector) | org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector (Optional) | Selects the mode in which each file/partition in the bootstrapped dataset gets bootstrapped

`Config Param: MODE_SELECTOR_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.mode.selector.regex](#hoodiebootstrapmodeselectorregex) | .* (Optional) | Matches each bootstrap dataset partition against this regex and applies the mode below to it.

`Config Param: PARTITION_SELECTOR_REGEX_PATTERN`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.mode.selector.regex.mode](#hoodiebootstrapmodeselectorregexmode) | METADATA_ONLY (Optional) | org.apache.hudi.client.bootstrap.BootstrapMode: Bootstrap mode for importing an existing table into Hudi FULL_RECORD: In this mode, the full record data is copied into hudi and metadata columns are added. A full record bootstrap is functionally equivalent to a bulk-insert. After a full record bootstrap, Hudi will function properly even if the original table is modified or deleted. METADATA_ONLY(default): In this mode, the full record data is not copied into Hudi therefore it avoids full cost of rewriting the dataset. Instead, 'skeleton' files containing just the corresponding metadata columns are added to the Hudi table. Hudi relies on the data in the original table and will face data-loss or corruption if files in the original table location are deleted or modified.

`Config Param: PARTITION_SELECTOR_REGEX_MODE`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.parallelism](#hoodiebootstrapparallelism) | 1500 (Optional) | For metadata-only bootstrap, Hudi parallelizes the operation so that each table partition is handled by one Spark task. This config limits the number of parallelism. We pick the configured parallelism if the number of table partitions is larger than this configured value. The parallelism is assigned to the number of table partitions if it is smaller than the configured value. For full-record bootstrap, i.e., BULK_INSERT operation of the records, this configured value is passed as the BULK_INSERT shuffle parallelism (`hoodie.bulkinsert.shuffle.parallelism`), determining the BULK_INSERT write behavior. If you see that the bootstrap is slow due to the limited parallelism, you can increase this.

`Config Param: PARALLELISM_VALUE`
`Since Version: 0.6.0` | +| [hoodie.bootstrap.partitionpath.translator.class](#hoodiebootstrappartitionpathtranslatorclass) | org.apache.hudi.client.bootstrap.translator.IdentityBootstrapPartitionPathTranslator (Optional) | Translates the partition paths from the bootstrapped data into how it is laid out as a Hudi table.

`Config Param: PARTITION_PATH_TRANSLATOR_CLASS_NAME`
`Since Version: 0.6.0` | --- @@ -601,27 +601,27 @@ Cleaning (reclamation of older/unused file groups/slices). [**Basic Configs**](#Clean-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------- | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.clean.async](#hoodiecleanasync) | false (Optional) | Only applies when hoodie.clean.automatic is turned on. When turned on runs cleaner async with writing, which can speed up overall write performance. | | -| [hoodie.cleaner.commits.retained](#hoodiecleanercommitsretained) | 10 (Optional) | Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much data retention the table supports for incremental queries. | | +| Config Name | Default | Description | +| ---------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.clean.async](#hoodiecleanasync) | false (Optional) | Only applies when hoodie.clean.automatic is turned on. When turned on runs cleaner async with writing, which can speed up overall write performance.
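As a hedged illustration of the Bootstrap Configs above, the options map below uses only keys listed in those tables; the source path and partition regex are placeholders, and the actual bootstrap write (operation type, table name, and so on) is assumed to be configured separately.

```scala
// Minimal sketch: bootstrap-related options for an existing Parquet dataset.
// The source path and regex below are placeholders, not values from this page.
val bootstrapOpts: Map[String, String] = Map(
  "hoodie.bootstrap.base.path" -> "s3://bucket/existing_parquet_table",
  "hoodie.bootstrap.mode.selector" ->
    "org.apache.hudi.client.bootstrap.selector.MetadataOnlyBootstrapModeSelector",
  "hoodie.bootstrap.mode.selector.regex" -> "2023/.*",          // partitions matched by this regex ...
  "hoodie.bootstrap.mode.selector.regex.mode" -> "FULL_RECORD", // ... get full-record bootstrap
  "hoodie.bootstrap.parallelism" -> "1500"
)
// These could later be passed to a writer, e.g. df.write.format("hudi").options(bootstrapOpts)...
```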

`Config Param: ASYNC_CLEAN` | +| [hoodie.cleaner.commits.retained](#hoodiecleanercommitsretained) | 10 (Optional) | Number of commits to retain, without cleaning. This will be retained for num_of_commits * time_between_commits (scheduled). This also directly translates into how much data retention the table supports for incremental queries.

`Config Param: CLEANER_COMMITS_RETAINED` | [**Advanced Configs**](#Clean-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------- | ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------- | -| [hoodie.clean.allow.multiple](#hoodiecleanallowmultiple) | true (Optional) | Allows scheduling/executing multiple cleans by enabling this config. If users prefer to strictly ensure clean requests should be mutually exclusive, .i.e. a 2nd clean will not be scheduled if another clean is not yet completed to avoid repeat cleaning of same files, they might want to disable this config. | 0.11.0 | -| [hoodie.clean.automatic](#hoodiecleanautomatic) | true (Optional) | When enabled, the cleaner table service is invoked immediately after each commit, to delete older file slices. It's recommended to enable this, to ensure metadata and data storage growth is bounded. | | -| [hoodie.clean.max.commits](#hoodiecleanmaxcommits) | 1 (Optional) | Number of commits after the last clean operation, before scheduling of a new clean is attempted. | | -| [hoodie.clean.trigger.strategy](#hoodiecleantriggerstrategy) | NUM_COMMITS (Optional) | org.apache.hudi.table.action.clean.CleaningTriggerStrategy: Controls when cleaning is scheduled. NUM_COMMITS(default): Trigger the cleaning service every N commits, determined by `hoodie.clean.max.commits`. | | -| [hoodie.cleaner.delete.bootstrap.base.file](#hoodiecleanerdeletebootstrapbasefile) | false (Optional) | When set to true, cleaner also deletes the bootstrap base file when it's skeleton base file is cleaned. Turn this to true, if you want to ensure the bootstrap dataset storage is reclaimed over time, as the table receives updates/deletes. Another reason to turn this on, would be to ensure data residing in bootstrap base files are also physically deleted, to comply with data privacy enforcement processes. | | -| [hoodie.cleaner.fileversions.retained](#hoodiecleanerfileversionsretained) | 3 (Optional) | When KEEP_LATEST_FILE_VERSIONS cleaning policy is used, the minimum number of file slices to retain in each file group, during cleaning. | | -| [hoodie.cleaner.hours.retained](#hoodiecleanerhoursretained) | 24 (Optional) | Number of hours for which commits need to be retained. This config provides a more flexible option ascompared to number of commits retained for cleaning service. 
Setting this property ensures all the files, but the latest in a file group, corresponding to commits with commit times older than the configured number of hours to be retained are cleaned. | | -| [hoodie.cleaner.incremental.mode](#hoodiecleanerincrementalmode) | true (Optional) | When enabled, the plans for each cleaner service run is computed incrementally off the events in the timeline, since the last cleaner run. This is much more efficient than obtaining listings for the full table for each planning (even with a metadata table). | | -| [hoodie.cleaner.parallelism](#hoodiecleanerparallelism) | 200 (Optional) | This config controls the behavior of both the cleaning plan and cleaning execution. Deriving the cleaning plan is parallelized at the table partition level, i.e., each table partition is processed by one Spark task to figure out the files to clean. The cleaner picks the configured parallelism if the number of table partitions is larger than this configured value. The parallelism is assigned to the number of table partitions if it is smaller than the configured value. The clean execution, i.e., the file deletion, is parallelized at file level, which is the unit of Spark task distribution. Similarly, the actual parallelism cannot exceed the configured value if the number of files is larger. If cleaning plan or execution is slow due to limited parallelism, you can increase this to tune the performance.. | | -| [hoodie.cleaner.policy](#hoodiecleanerpolicy) | KEEP_LATEST_COMMITS (Optional) | org.apache.hudi.common.model.HoodieCleaningPolicy: Cleaning policy to be used. The cleaner service deletes older file slices files to re-claim space. Long running query plans may often refer to older file slices and will break if those are cleaned, before the query has had a chance to run. So, it is good to make sure that the data is retained for more than the maximum query execution time. By default, the cleaning policy is determined based on one of the following configs explicitly set by the user (at most one of them can be set; otherwise, KEEP_LATEST_COMMITS cleaning policy is used). KEEP_LATEST_FILE_VERSIONS: keeps the last N versions of the file slices written; used when "hoodie.cleaner.fileversions.retained" is explicitly set only. KEEP_LATEST_COMMITS(default): keeps the file slices written by the last N commits; used when "hoodie.cleaner.commits.retained" is explicitly set only. KEEP_LATEST_BY_HOURS: keeps the file slices written in the last N hours based on the commit time; used when "hoodie.cleaner.hours.retained" is explicitly set only. | | -| [hoodie.cleaner.policy.failed.writes](#hoodiecleanerpolicyfailedwrites) | EAGER (Optional) | org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy: Policy that controls how to clean up failed writes. Hudi will delete any files written by failed writes to re-claim space. EAGER(default): Clean failed writes inline after every write operation. LAZY: Clean failed writes lazily after heartbeat timeout when the cleaning service runs. This policy is required when multi-writers are enabled. NEVER: Never clean failed writes. 
| | +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------- | ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.clean.allow.multiple](#hoodiecleanallowmultiple) | true (Optional) | Allows scheduling/executing multiple cleans by enabling this config. If users prefer to strictly ensure clean requests should be mutually exclusive, .i.e. a 2nd clean will not be scheduled if another clean is not yet completed to avoid repeat cleaning of same files, they might want to disable this config.

`Config Param: ALLOW_MULTIPLE_CLEANS`
`Since Version: 0.11.0` | +| [hoodie.clean.automatic](#hoodiecleanautomatic) | true (Optional) | When enabled, the cleaner table service is invoked immediately after each commit, to delete older file slices. It's recommended to enable this, to ensure metadata and data storage growth is bounded.

`Config Param: AUTO_CLEAN` | +| [hoodie.clean.max.commits](#hoodiecleanmaxcommits) | 1 (Optional) | Number of commits after the last clean operation, before scheduling of a new clean is attempted.

`Config Param: CLEAN_MAX_COMMITS` | +| [hoodie.clean.trigger.strategy](#hoodiecleantriggerstrategy) | NUM_COMMITS (Optional) | org.apache.hudi.table.action.clean.CleaningTriggerStrategy: Controls when cleaning is scheduled. NUM_COMMITS(default): Trigger the cleaning service every N commits, determined by `hoodie.clean.max.commits`.

`Config Param: CLEAN_TRIGGER_STRATEGY` | +| [hoodie.cleaner.delete.bootstrap.base.file](#hoodiecleanerdeletebootstrapbasefile) | false (Optional) | When set to true, cleaner also deletes the bootstrap base file when its skeleton base file is cleaned. Turn this to true if you want to ensure the bootstrap dataset storage is reclaimed over time, as the table receives updates/deletes. Another reason to turn this on would be to ensure data residing in bootstrap base files is also physically deleted, to comply with data privacy enforcement processes.

`Config Param: CLEANER_BOOTSTRAP_BASE_FILE_ENABLE` | +| [hoodie.cleaner.fileversions.retained](#hoodiecleanerfileversionsretained) | 3 (Optional) | When KEEP_LATEST_FILE_VERSIONS cleaning policy is used, the minimum number of file slices to retain in each file group, during cleaning.

`Config Param: CLEANER_FILE_VERSIONS_RETAINED` | +| [hoodie.cleaner.hours.retained](#hoodiecleanerhoursretained) | 24 (Optional) | Number of hours for which commits need to be retained. This config provides a more flexible option as compared to the number of commits retained for the cleaning service. Setting this property ensures that all files in a file group, except the latest, corresponding to commits older than the configured number of hours are cleaned.

`Config Param: CLEANER_HOURS_RETAINED` | +| [hoodie.cleaner.incremental.mode](#hoodiecleanerincrementalmode) | true (Optional) | When enabled, the plans for each cleaner service run are computed incrementally off the events in the timeline, since the last cleaner run. This is much more efficient than obtaining listings for the full table for each planning round (even with a metadata table).

`Config Param: CLEANER_INCREMENTAL_MODE_ENABLE` | +| [hoodie.cleaner.parallelism](#hoodiecleanerparallelism) | 200 (Optional) | This config controls the behavior of both the cleaning plan and cleaning execution. Deriving the cleaning plan is parallelized at the table partition level, i.e., each table partition is processed by one Spark task to figure out the files to clean. The cleaner picks the configured parallelism if the number of table partitions is larger than this configured value. The parallelism is assigned to the number of table partitions if it is smaller than the configured value. The clean execution, i.e., the file deletion, is parallelized at file level, which is the unit of Spark task distribution. Similarly, the actual parallelism cannot exceed the configured value if the number of files is larger. If the cleaning plan or execution is slow due to limited parallelism, you can increase this to tune the performance.

`Config Param: CLEANER_PARALLELISM_VALUE` | +| [hoodie.cleaner.policy](#hoodiecleanerpolicy) | KEEP_LATEST_COMMITS (Optional) | org.apache.hudi.common.model.HoodieCleaningPolicy: Cleaning policy to be used. The cleaner service deletes older file slices files to re-claim space. Long running query plans may often refer to older file slices and will break if those are cleaned, before the query has had a chance to run. So, it is good to make sure that the data is retained for more than the maximum query execution time. By default, the cleaning policy is determined based on one of the following configs explicitly set by the user (at most one of them can be set; otherwise, KEEP_LATEST_COMMITS cleaning policy is used). KEEP_LATEST_FILE_VERSIONS: keeps the last N versions of the file slices written; used when "hoodie.cleaner.fileversions.retained" is explicitly set only. KEEP_LATEST_COMMITS(default): keeps the file slices written by the last N commits; used when "hoodie.cleaner.commits.retained" is explicitly set only. KEEP_LATEST_BY_HOURS: keeps the file slices written in the last N hours based on the commit time; used when "hoodie.cleaner.hours.retained" is explicitly set only.

`Config Param: CLEANER_POLICY` | +| [hoodie.cleaner.policy.failed.writes](#hoodiecleanerpolicyfailedwrites) | EAGER (Optional) | org.apache.hudi.common.model.HoodieFailedWritesCleaningPolicy: Policy that controls how to clean up failed writes. Hudi will delete any files written by failed writes to re-claim space. EAGER(default): Clean failed writes inline after every write operation. LAZY: Clean failed writes lazily after heartbeat timeout when the cleaning service runs. This policy is required when multi-writers are enabled. NEVER: Never clean failed writes.

`Config Param: FAILED_WRITES_CLEANER_POLICY` | --- @@ -633,42 +633,42 @@ Configurations that control the clustering table service in hudi, which optimize [**Basic Configs**](#Clustering-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------------------------------- | --------------------- | ----------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false (Optional) | Enable running of clustering service, asynchronously as inserts happen on the table. | 0.7.0 | -| [hoodie.clustering.inline](#hoodieclusteringinline) | false (Optional) | Turn on inline clustering - clustering will be run after each write operation is complete | 0.7.0 | -| [hoodie.clustering.plan.strategy.small.file.limit](#hoodieclusteringplanstrategysmallfilelimit) | 314572800 (Optional) | Files smaller than the size in bytes specified here are candidates for clustering | 0.7.0 | -| [hoodie.clustering.plan.strategy.target.file.max.bytes](#hoodieclusteringplanstrategytargetfilemaxbytes) | 1073741824 (Optional) | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups | 0.7.0 | +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------------------------- | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.clustering.async.enabled](#hoodieclusteringasyncenabled) | false (Optional) | Enable running of clustering service, asynchronously as inserts happen on the table.
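For the cleaning knobs above, a minimal sketch of an hours-based retention setup follows; only keys from the Clean Configs tables are used, and the surrounding writer configuration is assumed.

```scala
// Minimal sketch: asynchronous cleaning that retains 72 hours of file slices.
// Per the table above, KEEP_LATEST_BY_HOURS pairs with hoodie.cleaner.hours.retained.
val cleaningOpts: Map[String, String] = Map(
  "hoodie.clean.automatic"        -> "true",
  "hoodie.clean.async"            -> "true",  // clean async with writing
  "hoodie.cleaner.policy"         -> "KEEP_LATEST_BY_HOURS",
  "hoodie.cleaner.hours.retained" -> "72",
  "hoodie.cleaner.parallelism"    -> "200"
)
```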

`Config Param: ASYNC_CLUSTERING_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.clustering.inline](#hoodieclusteringinline) | false (Optional) | Turn on inline clustering - clustering will be run after each write operation is complete

`Config Param: INLINE_CLUSTERING`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.small.file.limit](#hoodieclusteringplanstrategysmallfilelimit) | 314572800 (Optional) | Files smaller than the size in bytes specified here are candidates for clustering

`Config Param: PLAN_STRATEGY_SMALL_FILE_LIMIT`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.target.file.max.bytes](#hoodieclusteringplanstrategytargetfilemaxbytes) | 1073741824 (Optional) | Each group can produce 'N' (CLUSTERING_MAX_GROUP_SIZE/CLUSTERING_TARGET_FILE_SIZE) output file groups

`Config Param: PLAN_STRATEGY_TARGET_FILE_MAX_BYTES`
`Since Version: 0.7.0` | [**Advanced Configs**](#Clustering-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------- | -| [hoodie.clustering.plan.strategy.cluster.begin.partition](#hoodieclusteringplanstrategyclusterbeginpartition) | N/A **(Required)** | Begin partition used to filter partition (inclusive), only effective when the filter mode 'hoodie.clustering.plan.partition.filter.mode' is SELECTED_PARTITIONS | 0.11.0 | -| [hoodie.clustering.plan.strategy.cluster.end.partition](#hoodieclusteringplanstrategyclusterendpartition) | N/A **(Required)** | End partition used to filter partition (inclusive), only effective when the filter mode 'hoodie.clustering.plan.partition.filter.mode' is SELECTED_PARTITIONS | 0.11.0 | -| [hoodie.clustering.plan.strategy.partition.regex.pattern](#hoodieclusteringplanstrategypartitionregexpattern) | N/A **(Required)** | Filter clustering partitions that matched regex pattern | 0.11.0 | -| [hoodie.clustering.plan.strategy.partition.selected](#hoodieclusteringplanstrategypartitionselected) | N/A **(Required)** | Partitions to run clustering | 0.11.0 | -| [hoodie.clustering.plan.strategy.sort.columns](#hoodieclusteringplanstrategysortcolumns) | N/A **(Required)** | Columns to sort the data by when clustering | 0.7.0 | -| [hoodie.clustering.async.max.commits](#hoodieclusteringasyncmaxcommits) | 4 (Optional) | Config to control frequency of async clustering | 0.9.0 | -| [hoodie.clustering.execution.strategy.class](#hoodieclusteringexecutionstrategyclass) | org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy (Optional) | Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the clustering plan is executed. 
By default, we sort the file groups in th plan by the specified columns, while meeting the configured target file sizes. | 0.7.0 | -| [hoodie.clustering.inline.max.commits](#hoodieclusteringinlinemaxcommits) | 4 (Optional) | Config to control frequency of clustering planning | 0.7.0 | -| [hoodie.clustering.max.parallelism](#hoodieclusteringmaxparallelism) | 15 (Optional) | Maximum number of parallelism jobs submitted in clustering operation. If the resource is sufficient(Like Spark engine has enough idle executors), increasing this value will let the clustering job run faster, while it will give additional pressure to the execution engines to manage more concurrent running jobs. | 0.14.0 | -| [hoodie.clustering.plan.partition.filter.mode](#hoodieclusteringplanpartitionfiltermode) | NONE (Optional) | org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode: Partition filter mode used in the creation of clustering plan. NONE(default): Do not filter partitions. The clustering plan will include all partitions that have clustering candidates. RECENT_DAYS: This filter assumes that your data is partitioned by date. The clustering plan will only include partitions from K days ago to N days ago, where K >= N. K is determined by `hoodie.clustering.plan.strategy.daybased.lookback.partitions` and N is determined by `hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions`. SELECTED_PARTITIONS: The clustering plan will include only partition paths with names that sort within the inclusive range [`hoodie.clustering.plan.strategy.cluster.begin.partition`, `hoodie.clustering.plan.strategy.cluster.end.partition`]. DAY_ROLLING: To determine the partitions in the clustering plan, the eligible partitions will be sorted in ascending order. Each partition will have an index i in that list. The clustering plan will only contain partitions such that i mod 24 = H, where H is the current hour of the day (from 0 to 23). | 0.11.0 | -| [hoodie.clustering.plan.strategy.class](#hoodieclusteringplanstrategyclass) | org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy (Optional) | Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan i.e select what file groups are being clustered. Default strategy, looks at the clustering small file size limit (determined by hoodie.clustering.plan.strategy.small.file.limit) to pick the small file slices within partitions for clustering. | 0.7.0 | -| [hoodie.clustering.plan.strategy.daybased.lookback.partitions](#hoodieclusteringplanstrategydaybasedlookbackpartitions) | 2 (Optional) | Number of partitions to list to create ClusteringPlan | 0.7.0 | -| [hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions](#hoodieclusteringplanstrategydaybasedskipfromlatestpartitions) | 0 (Optional) | Number of partitions to skip from latest when choosing partitions to create ClusteringPlan | 0.9.0 | -| [hoodie.clustering.plan.strategy.max.bytes.per.group](#hoodieclusteringplanstrategymaxbytespergroup) | 2147483648 (Optional) | Each clustering operation can create multiple output file groups. Total amount of data processed by clustering operation is defined by below two properties (CLUSTERING_MAX_BYTES_PER_GROUP * CLUSTERING_MAX_NUM_GROUPS). Max amount of data to be included in one group | 0.7.0 | -| [hoodie.clustering.plan.strategy.max.num.groups](#hoodieclusteringplanstrategymaxnumgroups) | 30 (Optional) | Maximum number of groups to create as part of ClusteringPlan. 
Increasing groups will increase parallelism | 0.7.0 | -| [hoodie.clustering.plan.strategy.single.group.clustering.enabled](#hoodieclusteringplanstrategysinglegroupclusteringenabled) | true (Optional) | Whether to generate clustering plan when there is only one file group involved, by default true | 0.14.0 | -| [hoodie.clustering.rollback.pending.replacecommit.on.conflict](#hoodieclusteringrollbackpendingreplacecommitonconflict) | false (Optional) | If updates are allowed to file groups pending clustering, then set this config to rollback failed or pending clustering instants. Pending clustering will be rolled back ONLY IF there is conflict between incoming upsert and filegroup to be clustered. Please exercise caution while setting this config, especially when clustering is done very frequently. This could lead to race condition in rare scenarios, for example, when the clustering completes after instants are fetched but before rollback completed. | 0.10.0 | -| [hoodie.clustering.schedule.inline](#hoodieclusteringscheduleinline) | false (Optional) | When set to true, clustering service will be attempted for inline scheduling after each write. Users have to ensure they have a separate job to run async clustering(execution) for the one scheduled by this writer. Users can choose to set both `hoodie.clustering.inline` and `hoodie.clustering.schedule.inline` to false and have both scheduling and execution triggered by any async process, on which case `hoodie.clustering.async.enabled` is expected to be set to true. But if `hoodie.clustering.inline` is set to false, and `hoodie.clustering.schedule.inline` is set to true, regular writers will schedule clustering inline, but users are expected to trigger async job for execution. If `hoodie.clustering.inline` is set to true, regular writers will do both scheduling and execution inline for clustering | | -| [hoodie.clustering.updates.strategy](#hoodieclusteringupdatesstrategy) | org.apache.hudi.client.clustering.update.strategy.SparkRejectUpdateStrategy (Optional) | Determines how to handle updates, deletes to file groups that are under clustering. Default strategy just rejects the update | 0.7.0 | -| [hoodie.layout.optimize.build.curve.sample.size](#hoodielayoutoptimizebuildcurvesamplesize) | 200000 (Optional) | Determines target sample size used by the Boundary-based Interleaved Index method of building space-filling curve. Larger sample size entails better layout optimization outcomes, at the expense of higher memory footprint. | 0.10.0 | -| [hoodie.layout.optimize.curve.build.method](#hoodielayoutoptimizecurvebuildmethod) | DIRECT (Optional) | org.apache.hudi.config.HoodieClusteringConfig$SpatialCurveCompositionStrategyType: This configuration only has effect if `hoodie.layout.optimize.strategy` is set to either "z-order" or "hilbert" (i.e. leveraging space-filling curves). This configuration controls the type of a strategy to use for building the space-filling curves, tackling specifically how the Strings are ordered based on the curve. Since we truncate the String to 8 bytes for ordering, there are two issues: (1) it can lead to poor aggregation effect, (2) the truncation of String longer than 8 bytes loses the precision, if the Strings are different but the 8-byte prefix is the same. The boundary-based interleaved index method ("SAMPLE") has better generalization, solving the two problems above, but is slower than direct method ("DIRECT"). 
User should benchmark the write and query performance before tweaking this in production, if this is actually a problem. Please refer to RFC-28 for more details. DIRECT(default): This strategy builds the spatial curve in full, filling in all of the individual points corresponding to each individual record, which requires less compute. SAMPLE: This strategy leverages boundary-base interleaved index method (described in more details in Amazon DynamoDB blog https://aws.amazon.com/cn/blogs/database/tag/z-order/) and produces a better layout compared to DIRECT strategy. It requires more compute and is slower. | 0.10.0 | -| [hoodie.layout.optimize.data.skipping.enable](#hoodielayoutoptimizedataskippingenable) | true (Optional) | Enable data skipping by collecting statistics once layout optimization is complete. | 0.10.0. Deprecated since: 0.11.0 | -| [hoodie.layout.optimize.enable](#hoodielayoutoptimizeenable) | false (Optional) | This setting has no effect. Please refer to clustering configuration, as well as LAYOUT_OPTIMIZE_STRATEGY config to enable advanced record layout optimization strategies | 0.10.0. Deprecated since: 0.11.0 | -| [hoodie.layout.optimize.strategy](#hoodielayoutoptimizestrategy) | linear (Optional) | Determines ordering strategy used in records layout optimization. Currently supported strategies are "linear", "z-order" and "hilbert" values are supported. | 0.10.0 | +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.clustering.plan.strategy.cluster.begin.partition](#hoodieclusteringplanstrategyclusterbeginpartition) | N/A **(Required)** | Begin partition used to filter partition (inclusive), only effective when the filter mode 'hoodie.clustering.plan.partition.filter.mode' is SELECTED_PARTITIONS

`Config Param: PARTITION_FILTER_BEGIN_PARTITION`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.cluster.end.partition](#hoodieclusteringplanstrategyclusterendpartition) | N/A **(Required)** | End partition used to filter partition (inclusive), only effective when the filter mode 'hoodie.clustering.plan.partition.filter.mode' is SELECTED_PARTITIONS

`Config Param: PARTITION_FILTER_END_PARTITION`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.partition.regex.pattern](#hoodieclusteringplanstrategypartitionregexpattern) | N/A **(Required)** | Filter clustering partitions that matched regex pattern

`Config Param: PARTITION_REGEX_PATTERN`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.partition.selected](#hoodieclusteringplanstrategypartitionselected) | N/A **(Required)** | Partitions to run clustering

`Config Param: PARTITION_SELECTED`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.sort.columns](#hoodieclusteringplanstrategysortcolumns) | N/A **(Required)** | Columns to sort the data by when clustering

`Config Param: PLAN_STRATEGY_SORT_COLUMNS`
`Since Version: 0.7.0` | +| [hoodie.clustering.async.max.commits](#hoodieclusteringasyncmaxcommits) | 4 (Optional) | Config to control frequency of async clustering

`Config Param: ASYNC_CLUSTERING_MAX_COMMITS`
`Since Version: 0.9.0` | +| [hoodie.clustering.execution.strategy.class](#hoodieclusteringexecutionstrategyclass) | org.apache.hudi.client.clustering.run.strategy.SparkSortAndSizeExecutionStrategy (Optional) | Config to provide a strategy class (subclass of RunClusteringStrategy) to define how the clustering plan is executed. By default, we sort the file groups in the plan by the specified columns, while meeting the configured target file sizes.

`Config Param: EXECUTION_STRATEGY_CLASS_NAME`
`Since Version: 0.7.0` | +| [hoodie.clustering.inline.max.commits](#hoodieclusteringinlinemaxcommits) | 4 (Optional) | Config to control frequency of clustering planning

`Config Param: INLINE_CLUSTERING_MAX_COMMITS`
`Since Version: 0.7.0` | +| [hoodie.clustering.max.parallelism](#hoodieclusteringmaxparallelism) | 15 (Optional) | Maximum number of parallel jobs submitted in a clustering operation. If resources are sufficient (e.g., the Spark engine has enough idle executors), increasing this value will let the clustering job run faster, while putting additional pressure on the execution engine to manage more concurrently running jobs.

`Config Param: CLUSTERING_MAX_PARALLELISM`
`Since Version: 0.14.0` | +| [hoodie.clustering.plan.partition.filter.mode](#hoodieclusteringplanpartitionfiltermode) | NONE (Optional) | org.apache.hudi.table.action.cluster.ClusteringPlanPartitionFilterMode: Partition filter mode used in the creation of clustering plan. NONE(default): Do not filter partitions. The clustering plan will include all partitions that have clustering candidates. RECENT_DAYS: This filter assumes that your data is partitioned by date. The clustering plan will only include partitions from K days ago to N days ago, where K >= N. K is determined by `hoodie.clustering.plan.strategy.daybased.lookback.partitions` and N is determined by `hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions`. SELECTED_PARTITIONS: The clustering plan will include only partition paths with names that sort within the inclusive range [`hoodie.clustering.plan.strategy.cluster.begin.partition`, `hoodie.clustering.plan.strategy.cluster.end.partition`]. DAY_ROLLING: To determine the partitions in the clustering plan, the eligible partitions will be sorted in ascending order. Each partition will have an index i in that list. The clustering plan will only contain partitions such that i mod 24 = H, where H is the current hour of the day (from 0 to 23).

`Config Param: PLAN_PARTITION_FILTER_MODE_NAME`
`Since Version: 0.11.0` | +| [hoodie.clustering.plan.strategy.class](#hoodieclusteringplanstrategyclass) | org.apache.hudi.client.clustering.plan.strategy.SparkSizeBasedClusteringPlanStrategy (Optional) | Config to provide a strategy class (subclass of ClusteringPlanStrategy) to create clustering plan i.e select what file groups are being clustered. Default strategy, looks at the clustering small file size limit (determined by hoodie.clustering.plan.strategy.small.file.limit) to pick the small file slices within partitions for clustering.

`Config Param: PLAN_STRATEGY_CLASS_NAME`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.daybased.lookback.partitions](#hoodieclusteringplanstrategydaybasedlookbackpartitions) | 2 (Optional) | Number of partitions to list to create ClusteringPlan

`Config Param: DAYBASED_LOOKBACK_PARTITIONS`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.daybased.skipfromlatest.partitions](#hoodieclusteringplanstrategydaybasedskipfromlatestpartitions) | 0 (Optional) | Number of partitions to skip from latest when choosing partitions to create ClusteringPlan

`Config Param: PLAN_STRATEGY_SKIP_PARTITIONS_FROM_LATEST`
`Since Version: 0.9.0` | +| [hoodie.clustering.plan.strategy.max.bytes.per.group](#hoodieclusteringplanstrategymaxbytespergroup) | 2147483648 (Optional) | Each clustering operation can create multiple output file groups. Total amount of data processed by clustering operation is defined by below two properties (CLUSTERING_MAX_BYTES_PER_GROUP * CLUSTERING_MAX_NUM_GROUPS). Max amount of data to be included in one group

`Config Param: PLAN_STRATEGY_MAX_BYTES_PER_OUTPUT_FILEGROUP`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.max.num.groups](#hoodieclusteringplanstrategymaxnumgroups) | 30 (Optional) | Maximum number of groups to create as part of ClusteringPlan. Increasing groups will increase parallelism

`Config Param: PLAN_STRATEGY_MAX_GROUPS`
`Since Version: 0.7.0` | +| [hoodie.clustering.plan.strategy.single.group.clustering.enabled](#hoodieclusteringplanstrategysinglegroupclusteringenabled) | true (Optional) | Whether to generate clustering plan when there is only one file group involved, by default true

`Config Param: PLAN_STRATEGY_SINGLE_GROUP_CLUSTERING_ENABLED`
`Since Version: 0.14.0` | +| [hoodie.clustering.rollback.pending.replacecommit.on.conflict](#hoodieclusteringrollbackpendingreplacecommitonconflict) | false (Optional) | If updates are allowed to file groups pending clustering, then set this config to rollback failed or pending clustering instants. Pending clustering will be rolled back ONLY IF there is conflict between incoming upsert and filegroup to be clustered. Please exercise caution while setting this config, especially when clustering is done very frequently. This could lead to race condition in rare scenarios, for example, when the clustering completes after instants are fetched but before rollback completed.

`Config Param: ROLLBACK_PENDING_CLUSTERING_ON_CONFLICT`
`Since Version: 0.10.0` | +| [hoodie.clustering.schedule.inline](#hoodieclusteringscheduleinline) | false (Optional) | When set to true, clustering service will be attempted for inline scheduling after each write. Users have to ensure they have a separate job to run async clustering(execution) for the one scheduled by this writer. Users can choose to set both `hoodie.clustering.inline` and `hoodie.clustering.schedule.inline` to false and have both scheduling and execution triggered by any async process, on which case `hoodie.clustering.async.enabled` is expected to be set to true. But if `hoodie.clustering.inline` is set to false, and `hoodie.clustering.schedule.inline` is set to true, regular writers will schedule clustering inline, but users are expected to trigger async job for execution. If `hoodie.clustering.inline` is set to true, regular writers will do both scheduling and execution inline for clustering

`Config Param: SCHEDULE_INLINE_CLUSTERING` | +| [hoodie.clustering.updates.strategy](#hoodieclusteringupdatesstrategy) | org.apache.hudi.client.clustering.update.strategy.SparkRejectUpdateStrategy (Optional) | Determines how to handle updates, deletes to file groups that are under clustering. Default strategy just rejects the update

`Config Param: UPDATES_STRATEGY`
`Since Version: 0.7.0` | +| [hoodie.layout.optimize.build.curve.sample.size](#hoodielayoutoptimizebuildcurvesamplesize) | 200000 (Optional) | Determines target sample size used by the Boundary-based Interleaved Index method of building space-filling curve. Larger sample size entails better layout optimization outcomes, at the expense of higher memory footprint.

`Config Param: LAYOUT_OPTIMIZE_BUILD_CURVE_SAMPLE_SIZE`
`Since Version: 0.10.0` | +| [hoodie.layout.optimize.curve.build.method](#hoodielayoutoptimizecurvebuildmethod) | DIRECT (Optional) | org.apache.hudi.config.HoodieClusteringConfig$SpatialCurveCompositionStrategyType: This configuration only has effect if `hoodie.layout.optimize.strategy` is set to either "z-order" or "hilbert" (i.e. leveraging space-filling curves). This configuration controls the type of a strategy to use for building the space-filling curves, tackling specifically how the Strings are ordered based on the curve. Since we truncate the String to 8 bytes for ordering, there are two issues: (1) it can lead to poor aggregation effect, (2) the truncation of String longer than 8 bytes loses the precision, if the Strings are different but the 8-byte prefix is the same. The boundary-based interleaved index method ("SAMPLE") has better generalization, solving the two problems above, but is slower than direct method ("DIRECT"). User should benchmark the write and query performance before tweaking this in production, if this is actually a problem. Please refer to RFC-28 for more details. DIRECT(default): This strategy builds the spatial curve in full, filling in all of the individual points corresponding to each individual record, which requires less compute. SAMPLE: This strategy leverages boundary-base interleaved index method (described in more details in Amazon DynamoDB blog https://aws.amazon.com/cn/blogs/database/tag/z-order/) and produces a better layout compared to DIRECT strategy. It requires more compute and is slower.

`Config Param: LAYOUT_OPTIMIZE_SPATIAL_CURVE_BUILD_METHOD`
`Since Version: 0.10.0` | +| [hoodie.layout.optimize.data.skipping.enable](#hoodielayoutoptimizedataskippingenable) | true (Optional) | Enable data skipping by collecting statistics once layout optimization is complete.

`Config Param: LAYOUT_OPTIMIZE_DATA_SKIPPING_ENABLE`
`Since Version: 0.10.0`
`Deprecated since: 0.11.0` | +| [hoodie.layout.optimize.enable](#hoodielayoutoptimizeenable) | false (Optional) | This setting has no effect. Please refer to clustering configuration, as well as LAYOUT_OPTIMIZE_STRATEGY config to enable advanced record layout optimization strategies

`Config Param: LAYOUT_OPTIMIZE_ENABLE`
`Since Version: 0.10.0`
`Deprecated since: 0.11.0` | +| [hoodie.layout.optimize.strategy](#hoodielayoutoptimizestrategy) | linear (Optional) | Determines the ordering strategy used in record layout optimization. Currently, "linear", "z-order" and "hilbert" strategies are supported.

`Config Param: LAYOUT_OPTIMIZE_STRATEGY`
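As a rough illustration of how the clustering and layout-optimization options in the table above are usually passed down, the sketch below sets them through the Spark datasource writer. It assumes an existing DataFrame `df`, a placeholder table name and base path, and omits record-key/partition-path options; the values are examples, not recommendations.

```scala
import org.apache.spark.sql.SaveMode

// Inline clustering with a z-order layout, building the curve via boundary-based sampling.
df.write.format("hudi").
  option("hoodie.table.name", "trips_cow").                          // placeholder table name
  option("hoodie.clustering.inline", "true").                        // schedule + execute clustering inline
  option("hoodie.layout.optimize.strategy", "z-order").              // or "hilbert" / "linear"
  option("hoodie.layout.optimize.curve.build.method", "SAMPLE").     // DIRECT is the default
  option("hoodie.layout.optimize.build.curve.sample.size", "200000").
  mode(SaveMode.Append).
  save("/tmp/hudi/trips_cow")                                        // placeholder base path
```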
`Since Version: 0.10.0` | --- @@ -680,35 +680,35 @@ Configurations that control compaction (merging of log files onto a new base fil [**Basic Configs**](#Compaction-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.compact.inline](#hoodiecompactinline) | false (Optional) | When set to true, compaction service is triggered after each write. While being simpler operationally, this adds extra latency on the write path. | | -| [hoodie.compact.inline.max.delta.commits](#hoodiecompactinlinemaxdeltacommits) | 5 (Optional) | Number of delta commits after the last compaction, before scheduling of a new compaction is attempted. This config takes effect only for the compaction triggering strategy based on the number of commits, i.e., NUM_COMMITS, NUM_COMMITS_AFTER_LAST_REQUEST, NUM_AND_TIME, and NUM_OR_TIME. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------ | ---------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.compact.inline](#hoodiecompactinline) | false (Optional) | When set to true, compaction service is triggered after each write. While being simpler operationally, this adds extra latency on the write path.

`Config Param: INLINE_COMPACT` | +| [hoodie.compact.inline.max.delta.commits](#hoodiecompactinlinemaxdeltacommits) | 5 (Optional) | Number of delta commits after the last compaction, before scheduling of a new compaction is attempted. This config takes effect only for the compaction triggering strategy based on the number of commits, i.e., NUM_COMMITS, NUM_COMMITS_AFTER_LAST_REQUEST, NUM_AND_TIME, and NUM_OR_TIME.

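To make the two basic compaction switches above concrete, here is a minimal, hedged Spark datasource sketch. `df` and the save path are placeholders, `hoodie.datasource.write.table.type` is assumed here because inline compaction only applies to merge-on-read tables, and key/precombine options are omitted.

```scala
import org.apache.spark.sql.SaveMode

// Compact inline once 5 delta commits have accumulated since the last compaction.
df.write.format("hudi").
  option("hoodie.table.name", "trips_mor").                         // placeholder table name
  option("hoodie.datasource.write.table.type", "MERGE_ON_READ").    // assumed table-type option
  option("hoodie.compact.inline", "true").
  option("hoodie.compact.inline.max.delta.commits", "5").
  mode(SaveMode.Append).
  save("/tmp/hudi/trips_mor")                                       // placeholder base path
```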
`Config Param: INLINE_COMPACT_NUM_DELTA_COMMITS` | [**Advanced Configs**](#Compaction-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.compact.inline.max.delta.seconds](#hoodiecompactinlinemaxdeltaseconds) | 3600 (Optional) | Number of elapsed seconds after the last compaction, before scheduling a new one. This config takes effect only for the compaction triggering strategy based on the elapsed time, i.e., TIME_ELAPSED, NUM_AND_TIME, and NUM_OR_TIME. | | -| [hoodie.compact.inline.trigger.strategy](#hoodiecompactinlinetriggerstrategy) | NUM_COMMITS (Optional) | org.apache.hudi.table.action.compact.CompactionTriggerStrategy: Controls when compaction is scheduled. NUM_COMMITS(default): triggers compaction when there are at least N delta commits after last completed compaction. NUM_COMMITS_AFTER_LAST_REQUEST: triggers compaction when there are at least N delta commits after last completed or requested compaction. TIME_ELAPSED: triggers compaction after N seconds since last compaction. NUM_AND_TIME: triggers compaction when both there are at least N delta commits and N seconds elapsed (both must be satisfied) after last completed compaction. NUM_OR_TIME: triggers compaction when both there are at least N delta commits or N seconds elapsed (either condition is satisfied) after last completed compaction. | | -| [hoodie.compact.schedule.inline](#hoodiecompactscheduleinline) | false (Optional) | When set to true, compaction service will be attempted for inline scheduling after each write. Users have to ensure they have a separate job to run async compaction(execution) for the one scheduled by this writer. Users can choose to set both `hoodie.compact.inline` and `hoodie.compact.schedule.inline` to false and have both scheduling and execution triggered by any async process. But if `hoodie.compact.inline` is set to false, and `hoodie.compact.schedule.inline` is set to true, regular writers will schedule compaction inline, but users are expected to trigger async job for execution. If `hoodie.compact.inline` is set to true, regular writers will do both scheduling and execution inline for compaction | | -| [hoodie.compaction.daybased.target.partitions](#hoodiecompactiondaybasedtargetpartitions) | 10 (Optional) | Used by org.apache.hudi.io.compact.strategy.DayBasedCompactionStrategy to denote the number of latest partitions to compact during a compaction run. 
| | -| [hoodie.compaction.lazy.block.read](#hoodiecompactionlazyblockread) | true (Optional) | When merging the delta log files, this config helps to choose whether the log blocks should be read lazily or not. Choose true to use lazy block reading (low memory usage, but incurs seeks to each block header) or false for immediate block read (higher memory usage) | | -| [hoodie.compaction.logfile.num.threshold](#hoodiecompactionlogfilenumthreshold) | 0 (Optional) | Only if the log file num is greater than the threshold, the file group will be compacted. | 0.13.0 | -| [hoodie.compaction.logfile.size.threshold](#hoodiecompactionlogfilesizethreshold) | 0 (Optional) | Only if the log file size is greater than the threshold in bytes, the file group will be compacted. | | -| [hoodie.compaction.reverse.log.read](#hoodiecompactionreverselogread) | false (Optional) | HoodieLogFormatReader reads a logfile in the forward direction starting from pos=0 to pos=file_length. If this config is set to true, the reader reads the logfile in reverse direction, from pos=file_length to pos=0 | | -| [hoodie.compaction.strategy](#hoodiecompactionstrategy) | org.apache.hudi.table.action.compact.strategy.LogFileSizeBasedCompactionStrategy (Optional) | Compaction strategy decides which file groups are picked up for compaction during each compaction run. By default. Hudi picks the log file with most accumulated unmerged data | | -| [hoodie.compaction.target.io](#hoodiecompactiontargetio) | 512000 (Optional) | Amount of MBs to spend during compaction run for the LogFileSizeBasedCompactionStrategy. This value helps bound ingestion latency while compaction is run inline mode. | | -| [hoodie.copyonwrite.insert.auto.split](#hoodiecopyonwriteinsertautosplit) | true (Optional) | Config to control whether we control insert split sizes automatically based on average record sizes. It's recommended to keep this turned on, since hand tuning is otherwise extremely cumbersome. | | -| [hoodie.copyonwrite.insert.split.size](#hoodiecopyonwriteinsertsplitsize) | 500000 (Optional) | Number of inserts assigned for each partition/bucket for writing. We based the default on writing out 100MB files, with at least 1kb records (100K records per file), and over provision to 500K. As long as auto-tuning of splits is turned on, this only affects the first write, where there is no history to learn record sizes from. | | -| [hoodie.copyonwrite.record.size.estimate](#hoodiecopyonwriterecordsizeestimate) | 1024 (Optional) | The average record size. If not explicitly specified, hudi will compute the record size estimate compute dynamically based on commit metadata. This is critical in computing the insert parallelism and bin-packing inserts into small files. | | -| [hoodie.log.compaction.blocks.threshold](#hoodielogcompactionblocksthreshold) | 5 (Optional) | Log compaction can be scheduled if the no. of log blocks crosses this threshold value. This is effective only when log compaction is enabled via hoodie.log.compaction.inline | 0.13.0 | -| [hoodie.log.compaction.enable](#hoodielogcompactionenable) | false (Optional) | By enabling log compaction through this config, log compaction will also get enabled for the metadata table. | 0.14 | -| [hoodie.log.compaction.inline](#hoodielogcompactioninline) | false (Optional) | When set to true, logcompaction service is triggered after each write. While being simpler operationally, this adds extra latency on the write path. 
| 0.13.0 | -| [hoodie.optimized.log.blocks.scan.enable](#hoodieoptimizedlogblocksscanenable) | false (Optional) | New optimized scan for log blocks that handles all multi-writer use-cases while appending to log files. It also differentiates original blocks written by ingestion writers and compacted blocks written log compaction. | 0.13.0 | -| [hoodie.parquet.small.file.limit](#hoodieparquetsmallfilelimit) | 104857600 (Optional) | During upsert operation, we opportunistically expand existing small files on storage, instead of writing new files, to keep number of files to an optimum. This config sets the file size limit below which a file on storage becomes a candidate to be selected as such a `small file`. By default, treat any file <= 100MB as a small file. Also note that if this set <= 0, will not try to get small files and directly write new files | | -| [hoodie.record.size.estimation.threshold](#hoodierecordsizeestimationthreshold) | 1.0 (Optional) | We use the previous commits' metadata to calculate the estimated record size and use it to bin pack records into partitions. If the previous commit is too small to make an accurate estimation, Hudi will search commits in the reverse order, until we find a commit that has totalBytesWritten larger than (PARQUET_SMALL_FILE_LIMIT_BYTES * this_threshold) | | +| Config Name | Default | Description | +| ----------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.compact.inline.max.delta.seconds](#hoodiecompactinlinemaxdeltaseconds) | 3600 (Optional) | Number of elapsed seconds after the last compaction, before scheduling a new one. This config takes effect only for the compaction triggering strategy based on the elapsed time, i.e., TIME_ELAPSED, NUM_AND_TIME, and NUM_OR_TIME.

`Config Param: INLINE_COMPACT_TIME_DELTA_SECONDS` | +| [hoodie.compact.inline.trigger.strategy](#hoodiecompactinlinetriggerstrategy) | NUM_COMMITS (Optional) | org.apache.hudi.table.action.compact.CompactionTriggerStrategy: Controls when compaction is scheduled. NUM_COMMITS(default): triggers compaction when there are at least N delta commits after last completed compaction. NUM_COMMITS_AFTER_LAST_REQUEST: triggers compaction when there are at least N delta commits after last completed or requested compaction. TIME_ELAPSED: triggers compaction after N seconds since last compaction. NUM_AND_TIME: triggers compaction when both there are at least N delta commits and N seconds elapsed (both must be satisfied) after last completed compaction. NUM_OR_TIME: triggers compaction when both there are at least N delta commits or N seconds elapsed (either condition is satisfied) after last completed compaction.

`Config Param: INLINE_COMPACT_TRIGGER_STRATEGY` | +| [hoodie.compact.schedule.inline](#hoodiecompactscheduleinline) | false (Optional) | When set to true, compaction service will be attempted for inline scheduling after each write. Users have to ensure they have a separate job to run async compaction(execution) for the one scheduled by this writer. Users can choose to set both `hoodie.compact.inline` and `hoodie.compact.schedule.inline` to false and have both scheduling and execution triggered by any async process. But if `hoodie.compact.inline` is set to false, and `hoodie.compact.schedule.inline` is set to true, regular writers will schedule compaction inline, but users are expected to trigger async job for execution. If `hoodie.compact.inline` is set to true, regular writers will do both scheduling and execution inline for compaction

`Config Param: SCHEDULE_INLINE_COMPACT` | +| [hoodie.compaction.daybased.target.partitions](#hoodiecompactiondaybasedtargetpartitions) | 10 (Optional) | Used by org.apache.hudi.io.compact.strategy.DayBasedCompactionStrategy to denote the number of latest partitions to compact during a compaction run.

`Config Param: TARGET_PARTITIONS_PER_DAYBASED_COMPACTION` | +| [hoodie.compaction.lazy.block.read](#hoodiecompactionlazyblockread) | true (Optional) | When merging the delta log files, this config helps to choose whether the log blocks should be read lazily or not. Choose true to use lazy block reading (low memory usage, but incurs seeks to each block header) or false for immediate block read (higher memory usage)

`Config Param: COMPACTION_LAZY_BLOCK_READ_ENABLE` | +| [hoodie.compaction.logfile.num.threshold](#hoodiecompactionlogfilenumthreshold) | 0 (Optional) | Only if the log file num is greater than the threshold, the file group will be compacted.

`Config Param: COMPACTION_LOG_FILE_NUM_THRESHOLD`
`Since Version: 0.13.0` | +| [hoodie.compaction.logfile.size.threshold](#hoodiecompactionlogfilesizethreshold) | 0 (Optional) | Only if the log file size is greater than the threshold in bytes, the file group will be compacted.

`Config Param: COMPACTION_LOG_FILE_SIZE_THRESHOLD` | +| [hoodie.compaction.reverse.log.read](#hoodiecompactionreverselogread) | false (Optional) | HoodieLogFormatReader reads a logfile in the forward direction starting from pos=0 to pos=file_length. If this config is set to true, the reader reads the logfile in reverse direction, from pos=file_length to pos=0

`Config Param: COMPACTION_REVERSE_LOG_READ_ENABLE` | +| [hoodie.compaction.strategy](#hoodiecompactionstrategy) | org.apache.hudi.table.action.compact.strategy.LogFileSizeBasedCompactionStrategy (Optional) | Compaction strategy decides which file groups are picked up for compaction during each compaction run. By default, Hudi picks the log file with the most accumulated unmerged data

`Config Param: COMPACTION_STRATEGY` | +| [hoodie.compaction.target.io](#hoodiecompactiontargetio) | 512000 (Optional) | Amount of MBs to spend during a compaction run for the LogFileSizeBasedCompactionStrategy. This value helps bound ingestion latency while compaction is run in inline mode.

`Config Param: TARGET_IO_PER_COMPACTION_IN_MB` | +| [hoodie.copyonwrite.insert.auto.split](#hoodiecopyonwriteinsertautosplit) | true (Optional) | Config to control whether insert split sizes are determined automatically based on average record sizes. It's recommended to keep this turned on, since hand tuning is otherwise extremely cumbersome.

`Config Param: COPY_ON_WRITE_AUTO_SPLIT_INSERTS` | +| [hoodie.copyonwrite.insert.split.size](#hoodiecopyonwriteinsertsplitsize) | 500000 (Optional) | Number of inserts assigned for each partition/bucket for writing. We based the default on writing out 100MB files, with at least 1kb records (100K records per file), and over provision to 500K. As long as auto-tuning of splits is turned on, this only affects the first write, where there is no history to learn record sizes from.

`Config Param: COPY_ON_WRITE_INSERT_SPLIT_SIZE` | +| [hoodie.copyonwrite.record.size.estimate](#hoodiecopyonwriterecordsizeestimate) | 1024 (Optional) | The average record size. If not explicitly specified, Hudi will compute the record size estimate dynamically based on commit metadata. This is critical in computing the insert parallelism and bin-packing inserts into small files.

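The record-size estimate and the small-file limit further down this table work together: the estimate drives bin-packing of inserts, while the limit decides which existing files are candidates for expansion. A hedged sketch, reusing the same `df` and placeholder-path conventions as the earlier examples:

```scala
import org.apache.spark.sql.SaveMode

// Illustrative file-sizing tuning for an upsert-heavy copy-on-write table.
df.write.format("hudi").
  option("hoodie.parquet.small.file.limit", (100L * 1024 * 1024).toString). // files <= 100MB count as small
  option("hoodie.copyonwrite.record.size.estimate", "512").                 // assumed average record size in bytes
  mode(SaveMode.Append).
  save("/tmp/hudi/trips_cow")
```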
`Config Param: COPY_ON_WRITE_RECORD_SIZE_ESTIMATE` | +| [hoodie.log.compaction.blocks.threshold](#hoodielogcompactionblocksthreshold) | 5 (Optional) | Log compaction can be scheduled if the no. of log blocks crosses this threshold value. This is effective only when log compaction is enabled via hoodie.log.compaction.inline

`Config Param: LOG_COMPACTION_BLOCKS_THRESHOLD`
`Since Version: 0.13.0` | +| [hoodie.log.compaction.enable](#hoodielogcompactionenable) | false (Optional) | By enabling log compaction through this config, log compaction will also get enabled for the metadata table.

`Config Param: ENABLE_LOG_COMPACTION`
`Since Version: 0.14` | +| [hoodie.log.compaction.inline](#hoodielogcompactioninline) | false (Optional) | When set to true, the log compaction service is triggered after each write. While being simpler operationally, this adds extra latency on the write path.

`Config Param: INLINE_LOG_COMPACT`
`Since Version: 0.13.0` | +| [hoodie.optimized.log.blocks.scan.enable](#hoodieoptimizedlogblocksscanenable) | false (Optional) | New optimized scan for log blocks that handles all multi-writer use-cases while appending to log files. It also differentiates original blocks written by ingestion writers and compacted blocks written by log compaction.

`Config Param: ENABLE_OPTIMIZED_LOG_BLOCKS_SCAN`
`Since Version: 0.13.0` | +| [hoodie.parquet.small.file.limit](#hoodieparquetsmallfilelimit) | 104857600 (Optional) | During upsert operation, we opportunistically expand existing small files on storage, instead of writing new files, to keep the number of files at an optimum. This config sets the file size limit below which a file on storage becomes a candidate to be selected as such a `small file`. By default, any file <= 100MB is treated as a small file. Also note that if this is set to <= 0, Hudi will not look for small files and will directly write new files

`Config Param: PARQUET_SMALL_FILE_LIMIT` | +| [hoodie.record.size.estimation.threshold](#hoodierecordsizeestimationthreshold) | 1.0 (Optional) | We use the previous commits' metadata to calculate the estimated record size and use it to bin pack records into partitions. If the previous commit is too small to make an accurate estimation, Hudi will search commits in the reverse order, until we find a commit that has totalBytesWritten larger than (PARQUET_SMALL_FILE_LIMIT_BYTES * this_threshold)

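Several of the advanced compaction options above are meant to be combined; in particular, the trigger strategy decides how the delta-commit and elapsed-time thresholds are evaluated. A hedged sketch with illustrative thresholds, using the same `df`/path placeholders as before:

```scala
import org.apache.spark.sql.SaveMode

// Compact only when BOTH 10 delta commits AND 2 hours have elapsed since the last compaction.
df.write.format("hudi").
  option("hoodie.compact.inline", "true").
  option("hoodie.compact.inline.trigger.strategy", "NUM_AND_TIME").
  option("hoodie.compact.inline.max.delta.commits", "10").
  option("hoodie.compact.inline.max.delta.seconds", "7200").
  mode(SaveMode.Append).
  save("/tmp/hudi/trips_mor")
```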
`Config Param: RECORD_SIZE_ESTIMATION_THRESHOLD` | --- @@ -720,10 +720,10 @@ Configurations that control storage layout and data distribution, which defines [**Advanced Configs**](#Layout-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.storage.layout.partitioner.class](#hoodiestoragelayoutpartitionerclass) | N/A **(Required)** | Partitioner class, it is used to distribute data in a specific way. | | -| [hoodie.storage.layout.type](#hoodiestoragelayouttype) | DEFAULT (Optional) | org.apache.hudi.table.storage.HoodieStorageLayout$LayoutType: Determines how the files are organized within a table. DEFAULT(default): Each file group contains records of a certain set of keys, without particular grouping criteria. BUCKET: Each file group contains records of a set of keys which map to a certain range of hash values, so that using the hash function can easily identify the file group a record belongs to, based on the record key. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------- | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.storage.layout.partitioner.class](#hoodiestoragelayoutpartitionerclass) | N/A **(Required)** | Partitioner class, it is used to distribute data in a specific way.

`Config Param: LAYOUT_PARTITIONER_CLASS_NAME` | +| [hoodie.storage.layout.type](#hoodiestoragelayouttype) | DEFAULT (Optional) | org.apache.hudi.table.storage.HoodieStorageLayout$LayoutType: Determines how the files are organized within a table. DEFAULT(default): Each file group contains records of a certain set of keys, without particular grouping criteria. BUCKET: Each file group contains records of a set of keys which map to a certain range of hash values, so that using the hash function can easily identify the file group a record belongs to, based on the record key.

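To show how the two layout options above fit together, the sketch below switches a table to the BUCKET layout. The partitioner class name is a stand-in, not a specific Hudi class; substitute whatever implementation you actually use.

```scala
import org.apache.spark.sql.SaveMode

// Illustrative BUCKET layout; the partitioner class is a placeholder.
df.write.format("hudi").
  option("hoodie.storage.layout.type", "BUCKET").
  option("hoodie.storage.layout.partitioner.class", "com.example.hudi.MyBucketPartitioner").
  mode(SaveMode.Append).
  save("/tmp/hudi/trips_bucketed")
```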
`Config Param: LAYOUT_TYPE` | --- @@ -735,110 +735,110 @@ Controls memory usage for compaction and merges, performed internally by Hudi. [**Advanced Configs**](#Memory-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------- | --------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.memory.compaction.max.size](#hoodiememorycompactionmaxsize) | N/A **(Required)** | Maximum amount of memory used in bytes for compaction operations in bytes , before spilling to local storage. | | -| [hoodie.memory.spillable.map.path](#hoodiememoryspillablemappath) | N/A **(Required)** | Default file path for spillable map | | -| [hoodie.memory.compaction.fraction](#hoodiememorycompactionfraction) | 0.6 (Optional) | HoodieCompactedLogScanner reads logblocks, converts records to HoodieRecords and then merges these log blocks and records. At any point, the number of entries in a log block can be less than or equal to the number of entries in the corresponding parquet file. This can lead to OOM in the Scanner. Hence, a spillable map helps alleviate the memory pressure. Use this config to set the max allowable inMemory footprint of the spillable map | | -| [hoodie.memory.dfs.buffer.max.size](#hoodiememorydfsbuffermaxsize) | 16777216 (Optional) | Property to control the max memory in bytes for dfs input stream buffer size | | -| [hoodie.memory.merge.fraction](#hoodiememorymergefraction) | 0.6 (Optional) | This fraction is multiplied with the user memory fraction (1 - spark.memory.fraction) to get a final fraction of heap space to use during merge | | -| [hoodie.memory.merge.max.size](#hoodiememorymergemaxsize) | 1073741824 (Optional) | Maximum amount of memory used in bytes for merge operations, before spilling to local storage. | | -| [hoodie.memory.writestatus.failure.fraction](#hoodiememorywritestatusfailurefraction) | 0.1 (Optional) | Property to control how what fraction of the failed record, exceptions we report back to driver. Default is 10%. If set to 100%, with lot of failures, this can cause memory pressure, cause OOMs and mask actual data errors. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.memory.compaction.max.size](#hoodiememorycompactionmaxsize) | N/A **(Required)** | Maximum amount of memory used in bytes for compaction operations in bytes , before spilling to local storage.

`Config Param: MAX_MEMORY_FOR_COMPACTION` | +| [hoodie.memory.spillable.map.path](#hoodiememoryspillablemappath) | N/A **(Required)** | Default file path for spillable map

`Config Param: SPILLABLE_MAP_BASE_PATH` | +| [hoodie.memory.compaction.fraction](#hoodiememorycompactionfraction) | 0.6 (Optional) | HoodieCompactedLogScanner reads logblocks, converts records to HoodieRecords and then merges these log blocks and records. At any point, the number of entries in a log block can be less than or equal to the number of entries in the corresponding parquet file. This can lead to OOM in the Scanner. Hence, a spillable map helps alleviate the memory pressure. Use this config to set the max allowable inMemory footprint of the spillable map

`Config Param: MAX_MEMORY_FRACTION_FOR_COMPACTION` | +| [hoodie.memory.dfs.buffer.max.size](#hoodiememorydfsbuffermaxsize) | 16777216 (Optional) | Property to control the max memory in bytes for dfs input stream buffer size

`Config Param: MAX_DFS_STREAM_BUFFER_SIZE` | +| [hoodie.memory.merge.fraction](#hoodiememorymergefraction) | 0.6 (Optional) | This fraction is multiplied with the user memory fraction (1 - spark.memory.fraction) to get a final fraction of heap space to use during merge

`Config Param: MAX_MEMORY_FRACTION_FOR_MERGE` | +| [hoodie.memory.merge.max.size](#hoodiememorymergemaxsize) | 1073741824 (Optional) | Maximum amount of memory used in bytes for merge operations, before spilling to local storage.

`Config Param: MAX_MEMORY_FOR_MERGE` | +| [hoodie.memory.writestatus.failure.fraction](#hoodiememorywritestatusfailurefraction) | 0.1 (Optional) | Property to control what fraction of the failed record exceptions we report back to the driver. Default is 10%. If set to 100%, with a lot of failures, this can cause memory pressure, cause OOMs and mask actual data errors.

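A hedged sketch of how the memory bounds above might be passed on the write path; `df` and the paths are placeholders and the sizes are illustrative rather than tuning advice:

```scala
import org.apache.spark.sql.SaveMode

// Cap merge memory at 512MB and spill to a local scratch directory beyond that.
df.write.format("hudi").
  option("hoodie.memory.merge.max.size", (512L * 1024 * 1024).toString).
  option("hoodie.memory.spillable.map.path", "/tmp/hudi_spill").     // placeholder local path
  option("hoodie.memory.writestatus.failure.fraction", "0.1").
  mode(SaveMode.Append).
  save("/tmp/hudi/trips_mor")
```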
`Config Param: WRITESTATUS_FAILURE_FRACTION` | --- ### Write Configurations {#Write-Configurations} -Configurations that control write behavior on Hudi tables. These can be directly passed down from even higher level frameworks (e.g Spark datasources, Flink sink) and utilities (e.g DeltaStreamer). +Configurations that control write behavior on Hudi tables. These can be directly passed down from even higher level frameworks (e.g Spark datasources, Flink sink) and utilities (e.g Hudi Streamer). [**Basic Configs**](#Write-Configurations-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------- | ------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.base.path](#hoodiebasepath) | N/A **(Required)** | Base path on lake storage, under which all the table data is stored. Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs etc in .hoodie directory under this base path directory. | | -| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name that will be used for registering with metastores like HMS. Needs to be same across runs. | | -| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..) | | -| [hoodie.write.concurrency.mode](#hoodiewriteconcurrencymode) | SINGLE_WRITER (Optional) | org.apache.hudi.common.model.WriteConcurrencyMode: Concurrency modes for write operations. SINGLE_WRITER(default): Only one active writer to the table. Maximizes throughput. OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table with lazy conflict resolution using locks. This means that only one writer succeeds if multiple writers write to the same file group. | | +| Config Name | Default | Description | +| --------------------------------------------------------------------------------- | ------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.base.path](#hoodiebasepath) | N/A **(Required)** | Base path on lake storage, under which all the table data is stored. Always prefix it explicitly with the storage scheme (e.g hdfs://, s3:// etc). Hudi stores all the main meta-data about commits, savepoints, cleaning audit logs etc in .hoodie directory under this base path directory.

`Config Param: BASE_PATH` | +| [hoodie.table.name](#hoodietablename) | N/A **(Required)** | Table name that will be used for registering with metastores like HMS. Needs to be same across runs.

`Config Param: TBL_NAME` | +| [hoodie.datasource.write.precombine.field](#hoodiedatasourcewriteprecombinefield) | ts (Optional) | Field used in preCombining before actual write. When two records have the same key value, we will pick the one with the largest value for the precombine field, determined by Object.compareTo(..)

`Config Param: PRECOMBINE_FIELD_NAME` | +| [hoodie.write.concurrency.mode](#hoodiewriteconcurrencymode) | SINGLE_WRITER (Optional) | org.apache.hudi.common.model.WriteConcurrencyMode: Concurrency modes for write operations. SINGLE_WRITER(default): Only one active writer to the table. Maximizes throughput. OPTIMISTIC_CONCURRENCY_CONTROL: Multiple writers can operate on the table with lazy conflict resolution using locks. This means that only one writer succeeds if multiple writers write to the same file group.

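Tying the basic write options above together, a minimal single-writer write through the Spark datasource might look like the sketch below. `df`, the table name and the path are placeholders; when writing through the datasource the save path doubles as the base path, and record-key/partition-path options are omitted for brevity.

```scala
import org.apache.spark.sql.SaveMode

// Minimal single-writer write using the basic configs above.
df.write.format("hudi").
  option("hoodie.table.name", "trips").                        // required
  option("hoodie.datasource.write.precombine.field", "ts").    // largest `ts` wins on key collisions
  option("hoodie.write.concurrency.mode", "SINGLE_WRITER").    // OPTIMISTIC_CONCURRENCY_CONTROL also needs lock configs
  mode(SaveMode.Append).
  save("/tmp/hudi/trips")                                      // also serves as hoodie.base.path
```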
`Config Param: WRITE_CONCURRENCY_MODE` | [**Advanced Configs**](#Write-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------- | -| [hoodie.avro.schema](#hoodieavroschema) | N/A **(Required)** | Schema string representing the current write schema of the table. Hudi passes this to implementations of HoodieRecordPayload to convert incoming records to avro. This is also used as the write schema evolving records during an update. | | -| [hoodie.bulkinsert.user.defined.partitioner.class](#hoodiebulkinsertuserdefinedpartitionerclass) | N/A **(Required)** | If specified, this class will be used to re-partition records before they are bulk inserted. This can be used to sort, pack, cluster data optimally for common query patterns. For now we support a build-in user defined bulkinsert partitioner org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner which can does sorting based on specified column values set by hoodie.bulkinsert.user.defined.partitioner.sort.columns | | -| [hoodie.bulkinsert.user.defined.partitioner.sort.columns](#hoodiebulkinsertuserdefinedpartitionersortcolumns) | N/A **(Required)** | Columns to sort the data by when use org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner as user defined partitioner during bulk_insert. For example 'column1,column2' | | -| [hoodie.datasource.write.keygenerator.class](#hoodiedatasourcewritekeygeneratorclass) | N/A **(Required)** | Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator` extract a key out of incoming records. | | -| [hoodie.internal.schema](#hoodieinternalschema) | N/A **(Required)** | Schema string representing the latest schema of the table. 
Hudi passes this to implementations of evolution of schema | | -| [hoodie.write.schema](#hoodiewriteschema) | N/A **(Required)** | Config allowing to override writer's schema. This might be necessary in cases when writer's schema derived from the incoming dataset might actually be different from the schema we actually want to use when writing. This, for ex, could be the case for'partial-update' use-cases (like `MERGE INTO` Spark SQL statement for ex) where only a projection of the incoming dataset might be used to update the records in the existing table, prompting us to override the writer's schema | | -| [_.hoodie.allow.multi.write.on.same.instant](#_hoodieallowmultiwriteonsameinstant) | false (Optional) | | | -| [hoodie.allow.empty.commit](#hoodieallowemptycommit) | true (Optional) | Whether to allow generation of empty commits, even if no data was written in the commit. It's useful in cases where extra metadata needs to be published regardless e.g tracking source offsets when ingesting data | | -| [hoodie.allow.operation.metadata.field](#hoodieallowoperationmetadatafield) | false (Optional) | Whether to include '_hoodie_operation' in the metadata fields. Once enabled, all the changes of a record are persisted to the delta log directly without merge | 0.9.0 | -| [hoodie.auto.adjust.lock.configs](#hoodieautoadjustlockconfigs) | false (Optional) | Auto adjust lock configurations when metadata table is enabled and for async table services. | 0.11.0 | -| [hoodie.auto.commit](#hoodieautocommit) | true (Optional) | Controls whether a write operation should auto commit. This can be turned off to perform inspection of the uncommitted write before deciding to commit. | | -| [hoodie.avro.schema.external.transformation](#hoodieavroschemaexternaltransformation) | false (Optional) | When enabled, records in older schema are rewritten into newer schema during upsert,delete and background compaction,clustering operations. | | -| [hoodie.avro.schema.validate](#hoodieavroschemavalidate) | false (Optional) | Validate the schema used for the write against the latest schema, for backwards compatibility. | | -| [hoodie.bulkinsert.shuffle.parallelism](#hoodiebulkinsertshuffleparallelism) | 0 (Optional) | For large initial imports using bulk_insert operation, controls the parallelism to use for sort modes or custom partitioning done before writing records to the table. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data or the parallelism based on the logical plan for row writer. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. If you observe small files from the bulk insert operation, we suggest configuring this shuffle parallelism explicitly, so that the parallelism is around total_input_data_size/120MB. | | -| [hoodie.bulkinsert.sort.mode](#hoodiebulkinsertsortmode) | NONE (Optional) | org.apache.hudi.execution.bulkinsert.BulkInsertSortMode: Modes for sorting records during bulk insert. NONE(default): No sorting. Fastest and matches `spark.write.parquet()` in number of files and overhead. GLOBAL_SORT: This ensures best file sizes, with lowest memory overhead at cost of sorting. PARTITION_SORT: Strikes a balance by only sorting within a Spark RDD partition, still keeping the memory overhead of writing low. File sizing is not as good as GLOBAL_SORT. 
PARTITION_PATH_REPARTITION: This ensures that the data for a single physical partition in the table is written by the same Spark executor. This should only be used when input data is evenly distributed across different partition paths. If data is skewed (most records are intended for a handful of partition paths among all) then this can cause an imbalance among Spark executors. PARTITION_PATH_REPARTITION_AND_SORT: This ensures that the data for a single physical partition in the table is written by the same Spark executor. This should only be used when input data is evenly distributed across different partition paths. Compared to PARTITION_PATH_REPARTITION, this sort mode does an additional step of sorting the records based on the partition path within a single Spark partition, given that data for multiple physical partitions can be sent to the same Spark partition and executor. If data is skewed (most records are intended for a handful of partition paths among all) then this can cause an imbalance among Spark executors. | | -| [hoodie.client.heartbeat.interval_in_ms](#hoodieclientheartbeatinterval_in_ms) | 60000 (Optional) | Writers perform heartbeats to indicate liveness. Controls how often (in ms), such heartbeats are registered to lake storage. | | -| [hoodie.client.heartbeat.tolerable.misses](#hoodieclientheartbeattolerablemisses) | 2 (Optional) | Number of heartbeat misses, before a writer is deemed not alive and all pending writes are aborted. | | -| [hoodie.client.init.callback.classes](#hoodieclientinitcallbackclasses) | (Optional) | Fully-qualified class names of the Hudi client init callbacks to run at the initialization of the Hudi client. The class names are separated by `,`. The class must be a subclass of `org.apache.hudi.callback.HoodieClientInitCallback`.By default, no Hudi client init callback is executed. | 0.14.0 | -| [hoodie.combine.before.delete](#hoodiecombinebeforedelete) | true (Optional) | During delete operations, controls whether we should combine deletes (and potentially also upserts) before writing to storage. | | -| [hoodie.combine.before.insert](#hoodiecombinebeforeinsert) | false (Optional) | When inserted records share same key, controls whether they should be first combined (i.e de-duplicated) before writing to storage. | | -| [hoodie.combine.before.upsert](#hoodiecombinebeforeupsert) | true (Optional) | When upserted records share same key, controls whether they should be first combined (i.e de-duplicated) before writing to storage. This should be turned off only if you are absolutely certain that there are no duplicates incoming, otherwise it can lead to duplicate keys and violate the uniqueness guarantees. | | -| [hoodie.consistency.check.initial_interval_ms](#hoodieconsistencycheckinitial_interval_ms) | 2000 (Optional) | Initial time between successive attempts to ensure written data's metadata is consistent on storage. Grows with exponential backoff after the initial value. | | -| [hoodie.consistency.check.max_checks](#hoodieconsistencycheckmax_checks) | 7 (Optional) | Maximum number of checks, for consistency of written data. | | -| [hoodie.consistency.check.max_interval_ms](#hoodieconsistencycheckmax_interval_ms) | 300000 (Optional) | Max time to wait between successive attempts at performing consistency checks | | -| [hoodie.datasource.write.keygenerator.type](#hoodiedatasourcewritekeygeneratortype) | SIMPLE (Optional) | **Note** This is being actively worked on. Please use `hoodie.datasource.write.keygenerator.class` instead. 
org.apache.hudi.keygen.constant.KeyGeneratorType: Key generator type, indicating the key generator class to use, that implements `org.apache.hudi.keygen.KeyGenerator`. SIMPLE(default): Simple key generator, which takes names of fields to be used for recordKey and partitionPath as configs. COMPLEX: Complex key generator, which takes names of fields to be used for recordKey and partitionPath as configs. TIMESTAMP: Timestamp-based key generator, that relies on timestamps for partitioning field. Still picks record key by name. CUSTOM: This is a generic implementation type of KeyGenerator where users can configure record key as a single field or a combination of fields. Similarly partition path can be configured to have multiple fields or only one field. This KeyGenerator expects value for prop "hoodie.datasource.write.partitionpath.field" in a specific format. For example: properties.put("hoodie.datasource.write.partitionpath.field", "field1:PartitionKeyType1,field2:PartitionKeyType2"). NON_PARTITION: Simple Key generator for non-partitioned tables. GLOBAL_DELETE: Key generator for deletes using global indices. | | -| [hoodie.datasource.write.payload.class](#hoodiedatasourcewritepayloadclass) | org.apache.hudi.common.model.OverwriteWithLatestAvroPayload (Optional) | Payload class used. Override this, if you like to roll your own merge logic, when upserting/inserting. This will render any value set for PRECOMBINE_FIELD_OPT_VAL in-effective | | -| [hoodie.datasource.write.record.merger.impls](#hoodiedatasourcewriterecordmergerimpls) | org.apache.hudi.common.model.HoodieAvroRecordMerger (Optional) | List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. These merger impls will filter by hoodie.datasource.write.record.merger.strategy Hudi will pick most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc) | 0.13.0 | -| [hoodie.datasource.write.record.merger.strategy](#hoodiedatasourcewriterecordmergerstrategy) | eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 (Optional) | Id of merger strategy. Hudi will pick HoodieRecordMerger implementations in hoodie.datasource.write.record.merger.impls which has the same merger strategy id | 0.13.0 | -| [hoodie.datasource.write.schema.allow.auto.evolution.column.drop](#hoodiedatasourcewriteschemaallowautoevolutioncolumndrop) | false (Optional) | Controls whether table's schema is allowed to automatically evolve when incoming batch's schema can have any of the columns dropped. By default, Hudi will not allow this kind of (auto) schema evolution. Set this config to true to allow table's schema to be updated automatically when columns are dropped from the new incoming batch. | 0.13.0 | -| [hoodie.delete.shuffle.parallelism](#hoodiedeleteshuffleparallelism) | 0 (Optional) | Parallelism used for delete operation. Delete operations also performs shuffles, similar to upsert operation. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. 
| | -| [hoodie.embed.timeline.server](#hoodieembedtimelineserver) | true (Optional) | When true, spins up an instance of the timeline server (meta server that serves cached file listings, statistics),running on each writer's driver process, accepting requests during the write from executors. | | -| [hoodie.embed.timeline.server.async](#hoodieembedtimelineserverasync) | false (Optional) | Controls whether or not, the requests to the timeline server are processed in asynchronous fashion, potentially improving throughput. | | -| [hoodie.embed.timeline.server.gzip](#hoodieembedtimelineservergzip) | true (Optional) | Controls whether gzip compression is used, for large responses from the timeline server, to improve latency. | | -| [hoodie.embed.timeline.server.port](#hoodieembedtimelineserverport) | 0 (Optional) | Port at which the timeline server listens for requests. When running embedded in each writer, it picks a free port and communicates to all the executors. This should rarely be changed. | | -| [hoodie.embed.timeline.server.reuse.enabled](#hoodieembedtimelineserverreuseenabled) | false (Optional) | Controls whether the timeline server instance should be cached and reused across the JVM (across task lifecycles)to avoid startup costs. This should rarely be changed. | | -| [hoodie.embed.timeline.server.threads](#hoodieembedtimelineserverthreads) | -1 (Optional) | Number of threads to serve requests in the timeline server. By default, auto configured based on the number of underlying cores. | | -| [hoodie.fail.on.timeline.archiving](#hoodiefailontimelinearchiving) | true (Optional) | Timeline archiving removes older instants from the timeline, after each write operation, to minimize metadata overhead. Controls whether or not, the write should be failed as well, if such archiving fails. | | -| [hoodie.fail.writes.on.inline.table.service.exception](#hoodiefailwritesoninlinetableserviceexception) | true (Optional) | Table services such as compaction and clustering can fail and prevent syncing to the metaclient. Set this to true to fail writes when table services fail | 0.13.0 | -| [hoodie.fileid.prefix.provider.class](#hoodiefileidprefixproviderclass) | org.apache.hudi.table.RandomFileIdPrefixProvider (Optional) | File Id Prefix provider class, that implements `org.apache.hudi.fileid.FileIdPrefixProvider` | 0.10.0 | -| [hoodie.finalize.write.parallelism](#hoodiefinalizewriteparallelism) | 200 (Optional) | Parallelism for the write finalization internal operation, which involves removing any partially written files from lake storage, before committing the write. Reduce this value, if the high number of tasks incur delays for smaller tables or low latency writes. | | -| [hoodie.insert.shuffle.parallelism](#hoodieinsertshuffleparallelism) | 0 (Optional) | Parallelism for inserting records into the table. Inserts can shuffle data before writing to tune file sizes and optimize the storage layout. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. If you observe small files from the insert operation, we suggest configuring this shuffle parallelism explicitly, so that the parallelism is around total_input_data_size/120MB. 
| | -| [hoodie.markers.delete.parallelism](#hoodiemarkersdeleteparallelism) | 100 (Optional) | Determines the parallelism for deleting marker files, which are used to track all files (valid or invalid/partial) written during a write operation. Increase this value if delays are observed, with large batch writes. | | -| [hoodie.markers.timeline_server_based.batch.interval_ms](#hoodiemarkerstimeline_server_basedbatchinterval_ms) | 50 (Optional) | The batch interval in milliseconds for marker creation batch processing | 0.9.0 | -| [hoodie.markers.timeline_server_based.batch.num_threads](#hoodiemarkerstimeline_server_basedbatchnum_threads) | 20 (Optional) | Number of threads to use for batch processing marker creation requests at the timeline server | 0.9.0 | -| [hoodie.merge.allow.duplicate.on.inserts](#hoodiemergeallowduplicateoninserts) | false (Optional) | When enabled, we allow duplicate keys even if inserts are routed to merge with an existing file (for ensuring file sizing). This is only relevant for insert operation, since upsert, delete operations will ensure unique key constraints are maintained. | | -| [hoodie.merge.data.validation.enabled](#hoodiemergedatavalidationenabled) | false (Optional) | When enabled, data validation checks are performed during merges to ensure expected number of records after merge operation. | | -| [hoodie.merge.small.file.group.candidates.limit](#hoodiemergesmallfilegroupcandidateslimit) | 1 (Optional) | Limits number of file groups, whose base file satisfies small-file limit, to consider for appending records during upsert operation. Only applicable to MOR tables | | -| [hoodie.release.resource.on.completion.enable](#hoodiereleaseresourceoncompletionenable) | true (Optional) | Control to enable release all persist rdds when the spark job finish. | 0.11.0 | -| [hoodie.rollback.instant.backup.dir](#hoodierollbackinstantbackupdir) | .rollback_backup (Optional) | Path where instants being rolled back are copied. If not absolute path then a directory relative to .hoodie folder is created. | | -| [hoodie.rollback.instant.backup.enabled](#hoodierollbackinstantbackupenabled) | false (Optional) | Backup instants removed during rollback and restore (useful for debugging) | | -| [hoodie.rollback.parallelism](#hoodierollbackparallelism) | 100 (Optional) | This config controls the parallelism for rollback of commits. Rollbacks perform deletion of files or logging delete blocks to file groups on storage in parallel. The configure value limits the parallelism so that the number of Spark tasks do not exceed the value. If rollback is slow due to the limited parallelism, you can increase this to tune the performance. | | -| [hoodie.rollback.using.markers](#hoodierollbackusingmarkers) | true (Optional) | Enables a more efficient mechanism for rollbacks based on the marker files generated during the writes. Turned on by default. | | -| [hoodie.schema.cache.enable](#hoodieschemacacheenable) | false (Optional) | cache query internalSchemas in driver/executor side | | -| [hoodie.sensitive.config.keys](#hoodiesensitiveconfigkeys) | ssl,tls,sasl,auth,credentials (Optional) | Comma separated list of filters for sensitive config keys. Delta Streamer will not print any configuration which contains the configured filter. For example with a configured filter `ssl`, value for config `ssl.trustore.location` would be masked. 
| | -| [hoodie.skip.default.partition.validation](#hoodieskipdefaultpartitionvalidation) | false (Optional) | When table is upgraded from pre 0.12 to 0.12, we check for "default" partition and fail if found one. Users are expected to rewrite the data in those partitions. Enabling this config will bypass this validation | 0.12.0 | -| [hoodie.table.base.file.format](#hoodietablebasefileformat) | PARQUET (Optional) | File format to store all the base file data. org.apache.hudi.common.model.HoodieFileFormat: Hoodie file formats. PARQUET(default): Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. It provides efficient data compression and encoding schemes with enhanced performance to handle complex data in bulk. HFILE: (internal config) File format for metadata table. A file of sorted key/value pairs. Both keys and values are byte arrays. ORC: The Optimized Row Columnar (ORC) file format provides a highly efficient way to store Hive data. It was designed to overcome limitations of the other Hive file formats. Using ORC files improves performance when Hive is reading, writing, and processing data. | | -| [hoodie.table.services.enabled](#hoodietableservicesenabled) | true (Optional) | Master control to disable all table services including archive, clean, compact, cluster, etc. | 0.11.0 | -| [hoodie.timeline.layout.version](#hoodietimelinelayoutversion) | 1 (Optional) | Controls the layout of the timeline. Version 0 relied on renames, Version 1 (default) models the timeline as an immutable log relying only on atomic writes for object storage. | 0.5.1 | -| [hoodie.upsert.shuffle.parallelism](#hoodieupsertshuffleparallelism) | 0 (Optional) | Parallelism to use for upsert operation on the table. Upserts can shuffle data to perform index lookups, file sizing, bin packing records optimally into file groups. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. If you observe small files from the upsert operation, we suggest configuring this shuffle parallelism explicitly, so that the parallelism is around total_input_data_size/120MB. | | -| [hoodie.write.buffer.limit.bytes](#hoodiewritebufferlimitbytes) | 4194304 (Optional) | Size of in-memory buffer used for parallelizing network reads and lake storage writes. | | -| [hoodie.write.concurrency.async.conflict.detector.initial_delay_ms](#hoodiewriteconcurrencyasyncconflictdetectorinitial_delay_ms) | 0 (Optional) | Used for timeline-server-based markers with `AsyncTimelineServerBasedDetectionStrategy`. The time in milliseconds to delay the first execution of async marker-based conflict detection. | 0.13.0 | -| [hoodie.write.concurrency.async.conflict.detector.period_ms](#hoodiewriteconcurrencyasyncconflictdetectorperiod_ms) | 30000 (Optional) | Used for timeline-server-based markers with `AsyncTimelineServerBasedDetectionStrategy`. The period in milliseconds between successive executions of async marker-based conflict detection. | 0.13.0 | -| [hoodie.write.concurrency.early.conflict.check.commit.conflict](#hoodiewriteconcurrencyearlyconflictcheckcommitconflict) | false (Optional) | Whether to enable commit conflict checking or not during early conflict detection. 
| 0.13.0 | -| [hoodie.write.concurrency.early.conflict.detection.enable](#hoodiewriteconcurrencyearlyconflictdetectionenable) | false (Optional) | Whether to enable early conflict detection based on markers. It eagerly detects writing conflict before create markers and fails fast if a conflict is detected, to release cluster compute resources as soon as possible. | 0.13.0 | -| [hoodie.write.concurrency.early.conflict.detection.strategy](#hoodiewriteconcurrencyearlyconflictdetectionstrategy) | (Optional) | The class name of the early conflict detection strategy to use. This should be a subclass of `org.apache.hudi.common.conflict.detection.EarlyConflictDetectionStrategy`. | 0.13.0 | -| [hoodie.write.executor.disruptor.buffer.limit.bytes](#hoodiewriteexecutordisruptorbufferlimitbytes) | 1024 (Optional) | The size of the Disruptor Executor ring buffer, must be power of 2 | 0.13.0 | -| [hoodie.write.executor.disruptor.wait.strategy](#hoodiewriteexecutordisruptorwaitstrategy) | BLOCKING_WAIT (Optional) | org.apache.hudi.common.util.queue.DisruptorWaitStrategyType: Strategy employed for making Disruptor Executor wait on a cursor. BLOCKING_WAIT(default): The slowest of the available wait strategies. However, it is the most conservative with the respect to CPU usage and will give the most consistent behaviour across the widest variety of deployment options. SLEEPING_WAIT: Like the `BLOCKING_WAIT` strategy, it attempts to be conservative with CPU usage by using a simple busy wait loop. The difference is that the `SLEEPING_WAIT` strategy uses a call to `LockSupport.parkNanos(1)` in the middle of the loop. On a typical Linux system this will pause the thread for around 60µs. YIELDING_WAIT: The `YIELDING_WAIT` strategy is one of two wait strategy that can be used in low-latency systems. It is designed for cases where there is an opportunity to burn CPU cycles with the goal of improving latency. The `YIELDING_WAIT` strategy will busy spin, waiting for the sequence to increment to the appropriate value. Inside the body of the loop `Thread#yield()` will be called allowing other queued threads to run. This is the recommended wait strategy when you need very high performance, and the number of `EventHandler` threads is lower than the total number of logical cores, such as when hyper-threading is enabled. BUSY_SPIN_WAIT: The `BUSY_SPIN_WAIT` strategy is the highest performing wait strategy. Like the `YIELDING_WAIT` strategy, it can be used in low-latency systems, but puts the highest constraints on the deployment environment. | 0.13.0 | -| [hoodie.write.executor.type](#hoodiewriteexecutortype) | SIMPLE (Optional) | org.apache.hudi.common.util.queue.ExecutorType: Types of executor that implements org.apache.hudi.common.util.queue.HoodieExecutor. The executor orchestrates concurrent producers and consumers communicating through a message queue. BOUNDED_IN_MEMORY: Executor which orchestrates concurrent producers and consumers communicating through a bounded in-memory message queue using LinkedBlockingQueue. This queue will use extra lock to balance producers and consumers. DISRUPTOR: Executor which orchestrates concurrent producers and consumers communicating through disruptor as a lock free message queue to gain better writing performance. Although DisruptorExecutor is still an experimental feature. SIMPLE(default): Executor with no inner message queue and no inner lock. Consuming and writing records from iterator directly. 
The advantage is that there is no need for additional memory and cpu resources due to lock or multithreading. The disadvantage is that the executor is a single-write-single-read model, cannot support functions such as speed limit and can not de-couple the network read (shuffle read) and network write (writing objects/files to storage) anymore. | 0.13.0 | -| [hoodie.write.markers.type](#hoodiewritemarkerstype) | TIMELINE_SERVER_BASED (Optional) | org.apache.hudi.common.table.marker.MarkerType: Marker type indicating how markers are stored in the file system, used for identifying the files written and cleaning up files not committed which should be deleted. DIRECT: Individual marker file corresponding to each data file is directly created by the writer. TIMELINE_SERVER_BASED(default): Marker operations are all handled at the timeline service which serves as a proxy. New marker entries are batch processed and stored in a limited number of underlying files for efficiency. If HDFS is used or timeline server is disabled, DIRECT markers are used as fallback even if this is configured. This configuration does not take effect for Spark structured streaming; DIRECT markers are always used. | 0.9.0 | -| [hoodie.write.status.storage.level](#hoodiewritestatusstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Write status objects hold metadata about a write (stats, errors), that is not yet committed to storage. This controls the how that information is cached for inspection by clients. We rarely expect this to be changed. | | -| [hoodie.write.tagged.record.storage.level](#hoodiewritetaggedrecordstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Determine what level of persistence is used to cache write RDDs. Refer to org.apache.spark.storage.StorageLevel for different values | | -| [hoodie.writestatus.class](#hoodiewritestatusclass) | org.apache.hudi.client.WriteStatus (Optional) | Subclass of org.apache.hudi.client.WriteStatus to be used to collect information about a write. Can be overridden to collection additional metrics/statistics about the data if needed. 
| | +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.avro.schema](#hoodieavroschema) | N/A **(Required)** | Schema string representing the current write schema of the table. Hudi passes this to implementations of HoodieRecordPayload to convert incoming records to avro. This is also used as the write schema evolving records during an update.

`Config Param: AVRO_SCHEMA_STRING` | +| [hoodie.bulkinsert.user.defined.partitioner.class](#hoodiebulkinsertuserdefinedpartitionerclass) | N/A **(Required)** | If specified, this class will be used to re-partition records before they are bulk inserted. This can be used to sort, pack, cluster data optimally for common query patterns. For now we support a built-in user-defined bulkinsert partitioner, org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner, which sorts based on the columns specified by hoodie.bulkinsert.user.defined.partitioner.sort.columns

`Config Param: BULKINSERT_USER_DEFINED_PARTITIONER_CLASS_NAME` | +| [hoodie.bulkinsert.user.defined.partitioner.sort.columns](#hoodiebulkinsertuserdefinedpartitionersortcolumns) | N/A **(Required)** | Columns to sort the data by when using org.apache.hudi.execution.bulkinsert.RDDCustomColumnsSortPartitioner as the user-defined partitioner during bulk_insert. For example 'column1,column2'

`Config Param: BULKINSERT_USER_DEFINED_PARTITIONER_SORT_COLUMNS` | +| [hoodie.datasource.write.keygenerator.class](#hoodiedatasourcewritekeygeneratorclass) | N/A **(Required)** | Key generator class, that implements `org.apache.hudi.keygen.KeyGenerator`, to extract a key out of incoming records.

`Config Param: KEYGENERATOR_CLASS_NAME` | +| [hoodie.internal.schema](#hoodieinternalschema) | N/A **(Required)** | Schema string representing the latest schema of the table. Hudi passes this to implementations of schema evolution

`Config Param: INTERNAL_SCHEMA_STRING` | +| [hoodie.write.schema](#hoodiewriteschema) | N/A **(Required)** | Config allowing to override the writer's schema. This might be necessary when the writer's schema derived from the incoming dataset is different from the schema we actually want to use when writing. This could, for example, be the case for 'partial-update' use-cases (like the `MERGE INTO` Spark SQL statement) where only a projection of the incoming dataset might be used to update the records in the existing table, prompting us to override the writer's schema

`Config Param: WRITE_SCHEMA_OVERRIDE` | +| [_.hoodie.allow.multi.write.on.same.instant](#_hoodieallowmultiwriteonsameinstant) | false (Optional) |

`Config Param: ALLOW_MULTI_WRITE_ON_SAME_INSTANT_ENABLE` | +| [hoodie.allow.empty.commit](#hoodieallowemptycommit) | true (Optional) | Whether to allow generation of empty commits, even if no data was written in the commit. It's useful in cases where extra metadata needs to be published regardless e.g tracking source offsets when ingesting data

`Config Param: ALLOW_EMPTY_COMMIT` | +| [hoodie.allow.operation.metadata.field](#hoodieallowoperationmetadatafield) | false (Optional) | Whether to include '_hoodie_operation' in the metadata fields. Once enabled, all the changes of a record are persisted to the delta log directly without merge

`Config Param: ALLOW_OPERATION_METADATA_FIELD`
`Since Version: 0.9.0` | +| [hoodie.auto.adjust.lock.configs](#hoodieautoadjustlockconfigs) | false (Optional) | Auto adjust lock configurations when metadata table is enabled and for async table services.

`Config Param: AUTO_ADJUST_LOCK_CONFIGS`
`Since Version: 0.11.0` | +| [hoodie.auto.commit](#hoodieautocommit) | true (Optional) | Controls whether a write operation should auto commit. This can be turned off to perform inspection of the uncommitted write before deciding to commit.

`Config Param: AUTO_COMMIT_ENABLE` | +| [hoodie.avro.schema.external.transformation](#hoodieavroschemaexternaltransformation) | false (Optional) | When enabled, records in older schema are rewritten into newer schema during upsert, delete and background compaction, clustering operations.

`Config Param: AVRO_EXTERNAL_SCHEMA_TRANSFORMATION_ENABLE` | +| [hoodie.avro.schema.validate](#hoodieavroschemavalidate) | false (Optional) | Validate the schema used for the write against the latest schema, for backwards compatibility.

`Config Param: AVRO_SCHEMA_VALIDATE_ENABLE` | +| [hoodie.bulkinsert.shuffle.parallelism](#hoodiebulkinsertshuffleparallelism) | 0 (Optional) | For large initial imports using bulk_insert operation, controls the parallelism to use for sort modes or custom partitioning done before writing records to the table. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data or the parallelism based on the logical plan for row writer. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. If you observe small files from the bulk insert operation, we suggest configuring this shuffle parallelism explicitly, so that the parallelism is around total_input_data_size/120MB.

`Config Param: BULKINSERT_PARALLELISM_VALUE` | +| [hoodie.bulkinsert.sort.mode](#hoodiebulkinsertsortmode) | NONE (Optional) | org.apache.hudi.execution.bulkinsert.BulkInsertSortMode: Modes for sorting records during bulk insert. NONE(default): No sorting. Fastest and matches `spark.write.parquet()` in number of files and overhead. GLOBAL_SORT: This ensures best file sizes, with lowest memory overhead at cost of sorting. PARTITION_SORT: Strikes a balance by only sorting within a Spark RDD partition, still keeping the memory overhead of writing low. File sizing is not as good as GLOBAL_SORT. PARTITION_PATH_REPARTITION: This ensures that the data for a single physical partition in the table is written by the same Spark executor. This should only be used when input data is evenly distributed across different partition paths. If data is skewed (most records are intended for a handful of partition paths among all) then this can cause an imbalance among Spark executors. PARTITION_PATH_REPARTITION_AND_SORT: This ensures that the data for a single physical partition in the table is written by the same Spark executor. This should only be used when input data is evenly distributed across different partition paths. Compared to PARTITION_PATH_REPARTITION, this sort mode does an additional step of sorting the records based on the partition path within a single Spark partition, given that data for multiple physical partitions can be sent to the same Spark partition and executor. If data is skewed (most records are intended for a handful of partition paths among all) then this can cause an imbalance among Spark executors.
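For orientation, here is a minimal, hedged sketch of how the bulk_insert parallelism and sort mode above are commonly passed as Spark datasource options from Java; the table name, base path and the `hoodie.datasource.write.operation` option are illustrative assumptions, not values taken from this table.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class BulkInsertSortModeExample {
  // Sketch: bulk_insert with an explicit sort mode and shuffle parallelism.
  public static void bulkInsert(Dataset<Row> df) {
    df.write()
        .format("hudi")
        .option("hoodie.table.name", "trips")                        // hypothetical table name
        .option("hoodie.datasource.write.operation", "bulk_insert")  // assumed datasource option
        .option("hoodie.bulkinsert.sort.mode", "GLOBAL_SORT")        // one of the modes described above
        .option("hoodie.bulkinsert.shuffle.parallelism", "200")      // override the auto-deduced parallelism
        .mode(SaveMode.Append)
        .save("/tmp/hudi/trips");                                    // hypothetical base path
  }
}
```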

`Config Param: BULK_INSERT_SORT_MODE` | +| [hoodie.client.heartbeat.interval_in_ms](#hoodieclientheartbeatinterval_in_ms) | 60000 (Optional) | Writers perform heartbeats to indicate liveness. Controls how often (in ms), such heartbeats are registered to lake storage.

`Config Param: CLIENT_HEARTBEAT_INTERVAL_IN_MS` | +| [hoodie.client.heartbeat.tolerable.misses](#hoodieclientheartbeattolerablemisses) | 2 (Optional) | Number of heartbeat misses, before a writer is deemed not alive and all pending writes are aborted.

`Config Param: CLIENT_HEARTBEAT_NUM_TOLERABLE_MISSES` | +| [hoodie.client.init.callback.classes](#hoodieclientinitcallbackclasses) | (Optional) | Fully-qualified class names of the Hudi client init callbacks to run at the initialization of the Hudi client. The class names are separated by `,`. The class must be a subclass of `org.apache.hudi.callback.HoodieClientInitCallback`. By default, no Hudi client init callback is executed.

`Config Param: CLIENT_INIT_CALLBACK_CLASS_NAMES`
`Since Version: 0.14.0` | +| [hoodie.combine.before.delete](#hoodiecombinebeforedelete) | true (Optional) | During delete operations, controls whether we should combine deletes (and potentially also upserts) before writing to storage.

`Config Param: COMBINE_BEFORE_DELETE` | +| [hoodie.combine.before.insert](#hoodiecombinebeforeinsert) | false (Optional) | When inserted records share same key, controls whether they should be first combined (i.e de-duplicated) before writing to storage.

`Config Param: COMBINE_BEFORE_INSERT` | +| [hoodie.combine.before.upsert](#hoodiecombinebeforeupsert) | true (Optional) | When upserted records share same key, controls whether they should be first combined (i.e de-duplicated) before writing to storage. This should be turned off only if you are absolutely certain that there are no duplicates incoming, otherwise it can lead to duplicate keys and violate the uniqueness guarantees.

`Config Param: COMBINE_BEFORE_UPSERT` | +| [hoodie.consistency.check.initial_interval_ms](#hoodieconsistencycheckinitial_interval_ms) | 2000 (Optional) | Initial time between successive attempts to ensure written data's metadata is consistent on storage. Grows with exponential backoff after the initial value.

`Config Param: INITIAL_CONSISTENCY_CHECK_INTERVAL_MS` | +| [hoodie.consistency.check.max_checks](#hoodieconsistencycheckmax_checks) | 7 (Optional) | Maximum number of checks, for consistency of written data.

`Config Param: MAX_CONSISTENCY_CHECKS` | +| [hoodie.consistency.check.max_interval_ms](#hoodieconsistencycheckmax_interval_ms) | 300000 (Optional) | Max time to wait between successive attempts at performing consistency checks

`Config Param: MAX_CONSISTENCY_CHECK_INTERVAL_MS` | +| [hoodie.datasource.write.keygenerator.type](#hoodiedatasourcewritekeygeneratortype) | SIMPLE (Optional) | **Note** This is being actively worked on. Please use `hoodie.datasource.write.keygenerator.class` instead. org.apache.hudi.keygen.constant.KeyGeneratorType: Key generator type, indicating the key generator class to use, that implements `org.apache.hudi.keygen.KeyGenerator`. SIMPLE(default): Simple key generator, which takes names of fields to be used for recordKey and partitionPath as configs. COMPLEX: Complex key generator, which takes names of fields to be used for recordKey and partitionPath as configs. TIMESTAMP: Timestamp-based key generator, that relies on timestamps for partitioning field. Still picks record key by name. CUSTOM: This is a generic implementation type of KeyGenerator where users can configure record key as a single field or a combination of fields. Similarly partition path can be configured to have multiple fields or only one field. This KeyGenerator expects value for prop "hoodie.datasource.write.partitionpath.field" in a specific format. For example: properties.put("hoodie.datasource.write.partitionpath.field", "field1:PartitionKeyType1,field2:PartitionKeyType2"). NON_PARTITION: Simple Key generator for non-partitioned tables. GLOBAL_DELETE: Key generator for deletes using global indices.
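To illustrate how the key generator settings above are wired together, here is a hedged Java sketch; the record key and partition path field names, the `hoodie.datasource.write.recordkey.field` option and the ComplexKeyGenerator class name are illustrative assumptions rather than entries from this table.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class KeyGeneratorExample {
  // Sketch: configure the key generator class and the fields it reads.
  public static void upsert(Dataset<Row> df) {
    df.write()
        .format("hudi")
        .option("hoodie.table.name", "trips")                                        // hypothetical table name
        .option("hoodie.datasource.write.keygenerator.class",
                "org.apache.hudi.keygen.ComplexKeyGenerator")                        // assumed key generator class
        .option("hoodie.datasource.write.recordkey.field", "uuid")                   // assumed option and field name
        .option("hoodie.datasource.write.partitionpath.field", "region,event_date")  // assumed field names
        .mode(SaveMode.Append)
        .save("/tmp/hudi/trips");                                                    // hypothetical base path
  }
}
```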

`Config Param: KEYGENERATOR_TYPE` | +| [hoodie.datasource.write.payload.class](#hoodiedatasourcewritepayloadclass) | org.apache.hudi.common.model.OverwriteWithLatestAvroPayload (Optional) | Payload class used. Override this if you like to roll your own merge logic when upserting/inserting. This will render any value set for PRECOMBINE_FIELD_OPT_VAL ineffective

`Config Param: WRITE_PAYLOAD_CLASS_NAME` | +| [hoodie.datasource.write.record.merger.impls](#hoodiedatasourcewriterecordmergerimpls) | org.apache.hudi.common.model.HoodieAvroRecordMerger (Optional) | List of HoodieMerger implementations constituting Hudi's merging strategy -- based on the engine used. These merger implementations are filtered by hoodie.datasource.write.record.merger.strategy; Hudi will pick the most efficient implementation to perform merging/combining of the records (during update, reading MOR table, etc.)

`Config Param: RECORD_MERGER_IMPLS`
`Since Version: 0.13.0` | +| [hoodie.datasource.write.record.merger.strategy](#hoodiedatasourcewriterecordmergerstrategy) | eeb8d96f-b1e4-49fd-bbf8-28ac514178e5 (Optional) | Id of the merger strategy. Hudi will pick HoodieRecordMerger implementations in hoodie.datasource.write.record.merger.impls that have the same merger strategy id

`Config Param: RECORD_MERGER_STRATEGY`
`Since Version: 0.13.0` | +| [hoodie.datasource.write.schema.allow.auto.evolution.column.drop](#hoodiedatasourcewriteschemaallowautoevolutioncolumndrop) | false (Optional) | Controls whether table's schema is allowed to automatically evolve when incoming batch's schema can have any of the columns dropped. By default, Hudi will not allow this kind of (auto) schema evolution. Set this config to true to allow table's schema to be updated automatically when columns are dropped from the new incoming batch.

`Config Param: SCHEMA_ALLOW_AUTO_EVOLUTION_COLUMN_DROP`
`Since Version: 0.13.0` | +| [hoodie.delete.shuffle.parallelism](#hoodiedeleteshuffleparallelism) | 0 (Optional) | Parallelism used for delete operation. Delete operations also performs shuffles, similar to upsert operation. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism.

`Config Param: DELETE_PARALLELISM_VALUE` | +| [hoodie.embed.timeline.server](#hoodieembedtimelineserver) | true (Optional) | When true, spins up an instance of the timeline server (meta server that serves cached file listings, statistics), running on each writer's driver process, accepting requests during the write from executors.

`Config Param: EMBEDDED_TIMELINE_SERVER_ENABLE` | +| [hoodie.embed.timeline.server.async](#hoodieembedtimelineserverasync) | false (Optional) | Controls whether or not, the requests to the timeline server are processed in asynchronous fashion, potentially improving throughput.

`Config Param: EMBEDDED_TIMELINE_SERVER_USE_ASYNC_ENABLE` | +| [hoodie.embed.timeline.server.gzip](#hoodieembedtimelineservergzip) | true (Optional) | Controls whether gzip compression is used, for large responses from the timeline server, to improve latency.

`Config Param: EMBEDDED_TIMELINE_SERVER_COMPRESS_ENABLE` | +| [hoodie.embed.timeline.server.port](#hoodieembedtimelineserverport) | 0 (Optional) | Port at which the timeline server listens for requests. When running embedded in each writer, it picks a free port and communicates to all the executors. This should rarely be changed.

`Config Param: EMBEDDED_TIMELINE_SERVER_PORT_NUM` | +| [hoodie.embed.timeline.server.reuse.enabled](#hoodieembedtimelineserverreuseenabled) | false (Optional) | Controls whether the timeline server instance should be cached and reused across the JVM (across task lifecycles) to avoid startup costs. This should rarely be changed.

`Config Param: EMBEDDED_TIMELINE_SERVER_REUSE_ENABLED` | +| [hoodie.embed.timeline.server.threads](#hoodieembedtimelineserverthreads) | -1 (Optional) | Number of threads to serve requests in the timeline server. By default, auto configured based on the number of underlying cores.

`Config Param: EMBEDDED_TIMELINE_NUM_SERVER_THREADS` | +| [hoodie.fail.on.timeline.archiving](#hoodiefailontimelinearchiving) | true (Optional) | Timeline archiving removes older instants from the timeline, after each write operation, to minimize metadata overhead. Controls whether or not, the write should be failed as well, if such archiving fails.

`Config Param: FAIL_ON_TIMELINE_ARCHIVING_ENABLE` | +| [hoodie.fail.writes.on.inline.table.service.exception](#hoodiefailwritesoninlinetableserviceexception) | true (Optional) | Table services such as compaction and clustering can fail and prevent syncing to the metaclient. Set this to true to fail writes when table services fail

`Config Param: FAIL_ON_INLINE_TABLE_SERVICE_EXCEPTION`
`Since Version: 0.13.0` | +| [hoodie.fileid.prefix.provider.class](#hoodiefileidprefixproviderclass) | org.apache.hudi.table.RandomFileIdPrefixProvider (Optional) | File Id Prefix provider class, that implements `org.apache.hudi.fileid.FileIdPrefixProvider`

`Config Param: FILEID_PREFIX_PROVIDER_CLASS`
`Since Version: 0.10.0` | +| [hoodie.finalize.write.parallelism](#hoodiefinalizewriteparallelism) | 200 (Optional) | Parallelism for the write finalization internal operation, which involves removing any partially written files from lake storage, before committing the write. Reduce this value, if the high number of tasks incur delays for smaller tables or low latency writes.

`Config Param: FINALIZE_WRITE_PARALLELISM_VALUE` | +| [hoodie.insert.shuffle.parallelism](#hoodieinsertshuffleparallelism) | 0 (Optional) | Parallelism for inserting records into the table. Inserts can shuffle data before writing to tune file sizes and optimize the storage layout. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. If you observe small files from the insert operation, we suggest configuring this shuffle parallelism explicitly, so that the parallelism is around total_input_data_size/120MB.

`Config Param: INSERT_PARALLELISM_VALUE` | +| [hoodie.markers.delete.parallelism](#hoodiemarkersdeleteparallelism) | 100 (Optional) | Determines the parallelism for deleting marker files, which are used to track all files (valid or invalid/partial) written during a write operation. Increase this value if delays are observed, with large batch writes.

`Config Param: MARKERS_DELETE_PARALLELISM_VALUE` | +| [hoodie.markers.timeline_server_based.batch.interval_ms](#hoodiemarkerstimeline_server_basedbatchinterval_ms) | 50 (Optional) | The batch interval in milliseconds for marker creation batch processing

`Config Param: MARKERS_TIMELINE_SERVER_BASED_BATCH_INTERVAL_MS`
`Since Version: 0.9.0` | +| [hoodie.markers.timeline_server_based.batch.num_threads](#hoodiemarkerstimeline_server_basedbatchnum_threads) | 20 (Optional) | Number of threads to use for batch processing marker creation requests at the timeline server

`Config Param: MARKERS_TIMELINE_SERVER_BASED_BATCH_NUM_THREADS`
`Since Version: 0.9.0` | +| [hoodie.merge.allow.duplicate.on.inserts](#hoodiemergeallowduplicateoninserts) | false (Optional) | When enabled, we allow duplicate keys even if inserts are routed to merge with an existing file (for ensuring file sizing). This is only relevant for insert operation, since upsert, delete operations will ensure unique key constraints are maintained.

`Config Param: MERGE_ALLOW_DUPLICATE_ON_INSERTS_ENABLE` | +| [hoodie.merge.data.validation.enabled](#hoodiemergedatavalidationenabled) | false (Optional) | When enabled, data validation checks are performed during merges to ensure expected number of records after merge operation.

`Config Param: MERGE_DATA_VALIDATION_CHECK_ENABLE` | +| [hoodie.merge.small.file.group.candidates.limit](#hoodiemergesmallfilegroupcandidateslimit) | 1 (Optional) | Limits number of file groups, whose base file satisfies small-file limit, to consider for appending records during upsert operation. Only applicable to MOR tables

`Config Param: MERGE_SMALL_FILE_GROUP_CANDIDATES_LIMIT` | +| [hoodie.release.resource.on.completion.enable](#hoodiereleaseresourceoncompletionenable) | true (Optional) | Controls whether all persisted RDDs are released when the Spark job finishes.

`Config Param: RELEASE_RESOURCE_ENABLE`
`Since Version: 0.11.0` | +| [hoodie.rollback.instant.backup.dir](#hoodierollbackinstantbackupdir) | .rollback_backup (Optional) | Path where instants being rolled back are copied. If this is not an absolute path, a directory relative to the .hoodie folder is created.

`Config Param: ROLLBACK_INSTANT_BACKUP_DIRECTORY` | +| [hoodie.rollback.instant.backup.enabled](#hoodierollbackinstantbackupenabled) | false (Optional) | Backup instants removed during rollback and restore (useful for debugging)

`Config Param: ROLLBACK_INSTANT_BACKUP_ENABLED` | +| [hoodie.rollback.parallelism](#hoodierollbackparallelism) | 100 (Optional) | This config controls the parallelism for rollback of commits. Rollbacks perform deletion of files or logging delete blocks to file groups on storage in parallel. The configure value limits the parallelism so that the number of Spark tasks do not exceed the value. If rollback is slow due to the limited parallelism, you can increase this to tune the performance.

`Config Param: ROLLBACK_PARALLELISM_VALUE` | +| [hoodie.rollback.using.markers](#hoodierollbackusingmarkers) | true (Optional) | Enables a more efficient mechanism for rollbacks based on the marker files generated during the writes. Turned on by default.

`Config Param: ROLLBACK_USING_MARKERS_ENABLE` | +| [hoodie.schema.cache.enable](#hoodieschemacacheenable) | false (Optional) | Cache query internalSchemas on the driver/executor side

`Config Param: ENABLE_INTERNAL_SCHEMA_CACHE` | +| [hoodie.sensitive.config.keys](#hoodiesensitiveconfigkeys) | ssl,tls,sasl,auth,credentials (Optional) | Comma separated list of filters for sensitive config keys. Hudi Streamer will not print any configuration which contains the configured filter. For example with a configured filter `ssl`, value for config `ssl.trustore.location` would be masked.

`Config Param: SENSITIVE_CONFIG_KEYS_FILTER` | +| [hoodie.skip.default.partition.validation](#hoodieskipdefaultpartitionvalidation) | false (Optional) | When table is upgraded from pre 0.12 to 0.12, we check for "default" partition and fail if found one. Users are expected to rewrite the data in those partitions. Enabling this config will bypass this validation

`Config Param: SKIP_DEFAULT_PARTITION_VALIDATION`
`Since Version: 0.12.0` | +| [hoodie.table.base.file.format](#hoodietablebasefileformat) | PARQUET (Optional) | File format to store all the base file data. org.apache.hudi.common.model.HoodieFileFormat: Hoodie file formats. PARQUET(default): Apache Parquet is an open source, column-oriented data file format designed for efficient data storage and retrieval. It provides efficient data compression and encoding schemes with enhanced performance to handle complex data in bulk. HFILE: (internal config) File format for metadata table. A file of sorted key/value pairs. Both keys and values are byte arrays. ORC: The Optimized Row Columnar (ORC) file format provides a highly efficient way to store Hive data. It was designed to overcome limitations of the other Hive file formats. Using ORC files improves performance when Hive is reading, writing, and processing data.

`Config Param: BASE_FILE_FORMAT` | +| [hoodie.table.services.enabled](#hoodietableservicesenabled) | true (Optional) | Master control to disable all table services including archive, clean, compact, cluster, etc.

`Config Param: TABLE_SERVICES_ENABLED`
`Since Version: 0.11.0` | +| [hoodie.timeline.layout.version](#hoodietimelinelayoutversion) | 1 (Optional) | Controls the layout of the timeline. Version 0 relied on renames, Version 1 (default) models the timeline as an immutable log relying only on atomic writes for object storage.

`Config Param: TIMELINE_LAYOUT_VERSION_NUM`
`Since Version: 0.5.1` | +| [hoodie.upsert.shuffle.parallelism](#hoodieupsertshuffleparallelism) | 0 (Optional) | Parallelism to use for upsert operation on the table. Upserts can shuffle data to perform index lookups, file sizing, bin packing records optimally into file groups. Before 0.13.0 release, if users do not configure it, Hudi would use 200 as the default shuffle parallelism. From 0.13.0 onwards Hudi by default automatically uses the parallelism deduced by Spark based on the source data. If the shuffle parallelism is explicitly configured by the user, the user-configured parallelism is used in defining the actual parallelism. If you observe small files from the upsert operation, we suggest configuring this shuffle parallelism explicitly, so that the parallelism is around total_input_data_size/120MB.

`Config Param: UPSERT_PARALLELISM_VALUE` | +| [hoodie.write.buffer.limit.bytes](#hoodiewritebufferlimitbytes) | 4194304 (Optional) | Size of in-memory buffer used for parallelizing network reads and lake storage writes.

`Config Param: WRITE_BUFFER_LIMIT_BYTES_VALUE` | +| [hoodie.write.concurrency.async.conflict.detector.initial_delay_ms](#hoodiewriteconcurrencyasyncconflictdetectorinitial_delay_ms) | 0 (Optional) | Used for timeline-server-based markers with `AsyncTimelineServerBasedDetectionStrategy`. The time in milliseconds to delay the first execution of async marker-based conflict detection.

`Config Param: ASYNC_CONFLICT_DETECTOR_INITIAL_DELAY_MS`
`Since Version: 0.13.0` | +| [hoodie.write.concurrency.async.conflict.detector.period_ms](#hoodiewriteconcurrencyasyncconflictdetectorperiod_ms) | 30000 (Optional) | Used for timeline-server-based markers with `AsyncTimelineServerBasedDetectionStrategy`. The period in milliseconds between successive executions of async marker-based conflict detection.

`Config Param: ASYNC_CONFLICT_DETECTOR_PERIOD_MS`
`Since Version: 0.13.0` | +| [hoodie.write.concurrency.early.conflict.check.commit.conflict](#hoodiewriteconcurrencyearlyconflictcheckcommitconflict) | false (Optional) | Whether to enable commit conflict checking or not during early conflict detection.

`Config Param: EARLY_CONFLICT_DETECTION_CHECK_COMMIT_CONFLICT`
`Since Version: 0.13.0` | +| [hoodie.write.concurrency.early.conflict.detection.enable](#hoodiewriteconcurrencyearlyconflictdetectionenable) | false (Optional) | Whether to enable early conflict detection based on markers. It eagerly detects writing conflict before create markers and fails fast if a conflict is detected, to release cluster compute resources as soon as possible.

`Config Param: EARLY_CONFLICT_DETECTION_ENABLE`
`Since Version: 0.13.0` | +| [hoodie.write.concurrency.early.conflict.detection.strategy](#hoodiewriteconcurrencyearlyconflictdetectionstrategy) | (Optional) | The class name of the early conflict detection strategy to use. This should be a subclass of `org.apache.hudi.common.conflict.detection.EarlyConflictDetectionStrategy`.

`Config Param: EARLY_CONFLICT_DETECTION_STRATEGY_CLASS_NAME`
`Since Version: 0.13.0` | +| [hoodie.write.executor.disruptor.buffer.limit.bytes](#hoodiewriteexecutordisruptorbufferlimitbytes) | 1024 (Optional) | The size of the Disruptor Executor ring buffer, must be power of 2

`Config Param: WRITE_EXECUTOR_DISRUPTOR_BUFFER_LIMIT_BYTES`
`Since Version: 0.13.0` | +| [hoodie.write.executor.disruptor.wait.strategy](#hoodiewriteexecutordisruptorwaitstrategy) | BLOCKING_WAIT (Optional) | org.apache.hudi.common.util.queue.DisruptorWaitStrategyType: Strategy employed for making Disruptor Executor wait on a cursor. BLOCKING_WAIT(default): The slowest of the available wait strategies. However, it is the most conservative with the respect to CPU usage and will give the most consistent behaviour across the widest variety of deployment options. SLEEPING_WAIT: Like the `BLOCKING_WAIT` strategy, it attempts to be conservative with CPU usage by using a simple busy wait loop. The difference is that the `SLEEPING_WAIT` strategy uses a call to `LockSupport.parkNanos(1)` in the middle of the loop. On a typical Linux system this will pause the thread for around 60µs. YIELDING_WAIT: The `YIELDING_WAIT` strategy is one of two wait strategy that can be used in low-latency systems. It is designed for cases where there is an opportunity to burn CPU cycles with the goal of improving latency. The `YIELDING_WAIT` strategy will busy spin, waiting for the sequence to increment to the appropriate value. Inside the body of the loop `Thread#yield()` will be called allowing other queued threads to run. This is the recommended wait strategy when you need very high performance, and the number of `EventHandler` threads is lower than the total number of logical cores, such as when hyper-threading is enabled. BUSY_SPIN_WAIT: The `BUSY_SPIN_WAIT` strategy is the highest performing wait strategy. Like the `YIELDING_WAIT` strategy, it can be used in low-latency systems, but puts the highest constraints on the deployment environment.

`Config Param: WRITE_EXECUTOR_DISRUPTOR_WAIT_STRATEGY`
`Since Version: 0.13.0` | +| [hoodie.write.executor.type](#hoodiewriteexecutortype) | SIMPLE (Optional) | org.apache.hudi.common.util.queue.ExecutorType: Types of executor that implements org.apache.hudi.common.util.queue.HoodieExecutor. The executor orchestrates concurrent producers and consumers communicating through a message queue. BOUNDED_IN_MEMORY: Executor which orchestrates concurrent producers and consumers communicating through a bounded in-memory message queue using LinkedBlockingQueue. This queue will use extra lock to balance producers and consumers. DISRUPTOR: Executor which orchestrates concurrent producers and consumers communicating through disruptor as a lock free message queue to gain better writing performance. Although DisruptorExecutor is still an experimental feature. SIMPLE(default): Executor with no inner message queue and no inner lock. Consuming and writing records from iterator directly. The advantage is that there is no need for additional memory and cpu resources due to lock or multithreading. The disadvantage is that the executor is a single-write-single-read model, cannot support functions such as speed limit and can not de-couple the network read (shuffle read) and network write (writing objects/files to storage) anymore.

`Config Param: WRITE_EXECUTOR_TYPE`
`Since Version: 0.13.0` | +| [hoodie.write.markers.type](#hoodiewritemarkerstype) | TIMELINE_SERVER_BASED (Optional) | org.apache.hudi.common.table.marker.MarkerType: Marker type indicating how markers are stored in the file system, used for identifying the files written and cleaning up files not committed which should be deleted. DIRECT: Individual marker file corresponding to each data file is directly created by the writer. TIMELINE_SERVER_BASED(default): Marker operations are all handled at the timeline service which serves as a proxy. New marker entries are batch processed and stored in a limited number of underlying files for efficiency. If HDFS is used or timeline server is disabled, DIRECT markers are used as fallback even if this is configured. This configuration does not take effect for Spark structured streaming; DIRECT markers are always used.
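As a rough sketch of how the marker and early-conflict-detection settings above combine for multi-writer setups, the following collects them into a `java.util.Properties` object (how the properties are then handed to the writer is omitted); values are the defaults from this table except for the enable flags.

```java
import java.util.Properties;

public class EarlyConflictDetectionExample {
  // Sketch: marker-based early conflict detection on top of
  // timeline-server-based markers, using the configs documented above.
  public static Properties earlyConflictDetectionProps() {
    Properties props = new Properties();
    props.setProperty("hoodie.write.markers.type", "TIMELINE_SERVER_BASED");
    props.setProperty("hoodie.write.concurrency.early.conflict.detection.enable", "true");
    props.setProperty("hoodie.write.concurrency.early.conflict.check.commit.conflict", "true");
    props.setProperty("hoodie.write.concurrency.async.conflict.detector.period_ms", "30000");
    return props;
  }
}
```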

`Config Param: MARKERS_TYPE`
`Since Version: 0.9.0` | +| [hoodie.write.status.storage.level](#hoodiewritestatusstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Write status objects hold metadata about a write (stats, errors), that is not yet committed to storage. This controls how that information is cached for inspection by clients. We rarely expect this to be changed.

`Config Param: WRITE_STATUS_STORAGE_LEVEL_VALUE` | +| [hoodie.write.tagged.record.storage.level](#hoodiewritetaggedrecordstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Determine what level of persistence is used to cache write RDDs. Refer to org.apache.spark.storage.StorageLevel for different values

`Config Param: TAGGED_RECORD_STORAGE_LEVEL_VALUE` | +| [hoodie.writestatus.class](#hoodiewritestatusclass) | org.apache.hudi.client.WriteStatus (Optional) | Subclass of org.apache.hudi.client.WriteStatus to be used to collect information about a write. Can be overridden to collect additional metrics/statistics about the data if needed.

`Config Param: WRITE_STATUS_CLASS_NAME` | --- @@ -854,13 +854,13 @@ Configurations controlling callback behavior into HTTP endpoints, to push notifi [**Advanced Configs**](#Write-commit-callback-configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.write.commit.callback.http.url](#hoodiewritecommitcallbackhttpurl) | N/A **(Required)** | Callback host to be sent along with callback messages | 0.6.0 | -| [hoodie.write.commit.callback.class](#hoodiewritecommitcallbackclass) | org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback (Optional) | Full path of callback class and must be a subclass of HoodieWriteCommitCallback class, org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback by default | 0.6.0 | -| [hoodie.write.commit.callback.http.api.key](#hoodiewritecommitcallbackhttpapikey) | hudi_write_commit_http_callback (Optional) | Http callback API key. hudi_write_commit_http_callback by default | 0.6.0 | -| [hoodie.write.commit.callback.http.timeout.seconds](#hoodiewritecommitcallbackhttptimeoutseconds) | 30 (Optional) | Callback timeout in seconds. | 0.6.0 | -| [hoodie.write.commit.callback.on](#hoodiewritecommitcallbackon) | false (Optional) | Turn commit callback on/off. off by default. | 0.6.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.write.commit.callback.http.url](#hoodiewritecommitcallbackhttpurl) | N/A **(Required)** | Callback host to be sent along with callback messages

`Config Param: CALLBACK_HTTP_URL`
`Since Version: 0.6.0` | +| [hoodie.write.commit.callback.class](#hoodiewritecommitcallbackclass) | org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback (Optional) | Full path of callback class and must be a subclass of HoodieWriteCommitCallback class, org.apache.hudi.callback.impl.HoodieWriteCommitHttpCallback by default

`Config Param: CALLBACK_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.write.commit.callback.http.api.key](#hoodiewritecommitcallbackhttpapikey) | hudi_write_commit_http_callback (Optional) | Http callback API key. hudi_write_commit_http_callback by default

`Config Param: CALLBACK_HTTP_API_KEY_VALUE`
`Since Version: 0.6.0` | +| [hoodie.write.commit.callback.http.timeout.seconds](#hoodiewritecommitcallbackhttptimeoutseconds) | 30 (Optional) | Callback timeout in seconds.

`Config Param: CALLBACK_HTTP_TIMEOUT_IN_SECONDS`
`Since Version: 0.6.0` | +| [hoodie.write.commit.callback.on](#hoodiewritecommitcallbackon) | false (Optional) | Turn commit callback on/off. off by default.
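A minimal, hedged sketch of enabling the HTTP commit callback with the configs above; the endpoint URL and API key are placeholders.

```java
import java.util.Properties;

public class HttpCommitCallbackExample {
  // Sketch: turn the commit callback on and point it at an HTTP endpoint.
  public static Properties httpCallbackProps() {
    Properties props = new Properties();
    props.setProperty("hoodie.write.commit.callback.on", "true");
    props.setProperty("hoodie.write.commit.callback.http.url", "https://example.com/hudi/commits"); // placeholder endpoint
    props.setProperty("hoodie.write.commit.callback.http.api.key", "my-api-key");                   // placeholder key
    props.setProperty("hoodie.write.commit.callback.http.timeout.seconds", "30");
    return props;
  }
}
```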

`Config Param: TURN_CALLBACK_ON`
`Since Version: 0.6.0` | --- @@ -872,13 +872,13 @@ Controls notifications sent to Kafka, on events happening to a hudi table. [**Advanced Configs**](#Write-commit-Kafka-callback-configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.write.commit.callback.kafka.bootstrap.servers](#hoodiewritecommitcallbackkafkabootstrapservers) | N/A **(Required)** | Bootstrap servers of kafka cluster, to be used for publishing commit metadata. | 0.7.0 | -| [hoodie.write.commit.callback.kafka.partition](#hoodiewritecommitcallbackkafkapartition) | N/A **(Required)** | It may be desirable to serialize all changes into a single Kafka partition for providing strict ordering. By default, Kafka messages are keyed by table name, which guarantees ordering at the table level, but not globally (or when new partitions are added) | 0.7.0 | -| [hoodie.write.commit.callback.kafka.topic](#hoodiewritecommitcallbackkafkatopic) | N/A **(Required)** | Kafka topic name to publish timeline activity into. | 0.7.0 | -| [hoodie.write.commit.callback.kafka.acks](#hoodiewritecommitcallbackkafkaacks) | all (Optional) | kafka acks level, all by default to ensure strong durability. | 0.7.0 | -| [hoodie.write.commit.callback.kafka.retries](#hoodiewritecommitcallbackkafkaretries) | 3 (Optional) | Times to retry the produce. 3 by default | 0.7.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------- | ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.write.commit.callback.kafka.bootstrap.servers](#hoodiewritecommitcallbackkafkabootstrapservers) | N/A **(Required)** | Bootstrap servers of kafka cluster, to be used for publishing commit metadata.

`Config Param: BOOTSTRAP_SERVERS`
`Since Version: 0.7.0` | +| [hoodie.write.commit.callback.kafka.partition](#hoodiewritecommitcallbackkafkapartition) | N/A **(Required)** | It may be desirable to serialize all changes into a single Kafka partition for providing strict ordering. By default, Kafka messages are keyed by table name, which guarantees ordering at the table level, but not globally (or when new partitions are added)

`Config Param: PARTITION`
`Since Version: 0.7.0` | +| [hoodie.write.commit.callback.kafka.topic](#hoodiewritecommitcallbackkafkatopic) | N/A **(Required)** | Kafka topic name to publish timeline activity into.

`Config Param: TOPIC`
`Since Version: 0.7.0` | +| [hoodie.write.commit.callback.kafka.acks](#hoodiewritecommitcallbackkafkaacks) | all (Optional) | kafka acks level, all by default to ensure strong durability.

`Config Param: ACKS`
`Since Version: 0.7.0` | +| [hoodie.write.commit.callback.kafka.retries](#hoodiewritecommitcallbackkafkaretries) | 3 (Optional) | Times to retry the produce. 3 by default
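Similarly, a hedged sketch of the Kafka callback settings above; broker addresses and the topic name are placeholders, and `hoodie.write.commit.callback.class` would additionally need to point at a Kafka-based HoodieWriteCommitCallback implementation (its class name is not part of this table).

```java
import java.util.Properties;

public class KafkaCommitCallbackExample {
  // Sketch: Kafka-specific callback settings from the table above.
  public static Properties kafkaCallbackProps() {
    Properties props = new Properties();
    props.setProperty("hoodie.write.commit.callback.on", "true");
    props.setProperty("hoodie.write.commit.callback.kafka.bootstrap.servers", "broker1:9092,broker2:9092"); // placeholder brokers
    props.setProperty("hoodie.write.commit.callback.kafka.topic", "hudi-commits");                          // placeholder topic
    props.setProperty("hoodie.write.commit.callback.kafka.acks", "all");
    props.setProperty("hoodie.write.commit.callback.kafka.retries", "3");
    return props;
  }
}
```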

`Config Param: RETRIES`
`Since Version: 0.7.0` | --- @@ -890,19 +890,19 @@ Controls notifications sent to pulsar, on events happening to a hudi table. [**Advanced Configs**](#Write-commit-pulsar-callback-configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------ | -------------------------------------------------------------------------------- | ------------- | -| [hoodie.write.commit.callback.pulsar.broker.service.url](#hoodiewritecommitcallbackpulsarbrokerserviceurl) | N/A **(Required)** | Server's url of pulsar cluster, to be used for publishing commit metadata. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.topic](#hoodiewritecommitcallbackpulsartopic) | N/A **(Required)** | pulsar topic name to publish timeline activity into. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.connection-timeout](#hoodiewritecommitcallbackpulsarconnection-timeout) | 10s (Optional) | Duration of waiting for a connection to a broker to be established. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.keepalive-interval](#hoodiewritecommitcallbackpulsarkeepalive-interval) | 30s (Optional) | Duration of keeping alive interval for each client broker connection. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.operation-timeout](#hoodiewritecommitcallbackpulsaroperation-timeout) | 30s (Optional) | Duration of waiting for completing an operation. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.producer.block-if-queue-full](#hoodiewritecommitcallbackpulsarproducerblock-if-queue-full) | true (Optional) | When the queue is full, the method is blocked instead of an exception is thrown. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.producer.pending-queue-size](#hoodiewritecommitcallbackpulsarproducerpending-queue-size) | 1000 (Optional) | The maximum size of a queue holding pending messages. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.producer.pending-total-size](#hoodiewritecommitcallbackpulsarproducerpending-total-size) | 50000 (Optional) | The maximum number of pending messages across partitions. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.producer.route-mode](#hoodiewritecommitcallbackpulsarproducerroute-mode) | RoundRobinPartition (Optional) | Message routing logic for producers on partitioned topics. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.producer.send-timeout](#hoodiewritecommitcallbackpulsarproducersend-timeout) | 30s (Optional) | The timeout in each sending to pulsar. | 0.11.0 | -| [hoodie.write.commit.callback.pulsar.request-timeout](#hoodiewritecommitcallbackpulsarrequest-timeout) | 60s (Optional) | Duration of waiting for completing a request. | 0.11.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.write.commit.callback.pulsar.broker.service.url](#hoodiewritecommitcallbackpulsarbrokerserviceurl) | N/A **(Required)** | Server's url of pulsar cluster, to be used for publishing commit metadata.

`Config Param: BROKER_SERVICE_URL`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.topic](#hoodiewritecommitcallbackpulsartopic) | N/A **(Required)** | pulsar topic name to publish timeline activity into.

`Config Param: TOPIC`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.connection-timeout](#hoodiewritecommitcallbackpulsarconnection-timeout) | 10s (Optional) | Duration of waiting for a connection to a broker to be established.

`Config Param: CONNECTION_TIMEOUT`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.keepalive-interval](#hoodiewritecommitcallbackpulsarkeepalive-interval) | 30s (Optional) | Duration of keeping alive interval for each client broker connection.

`Config Param: KEEPALIVE_INTERVAL`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.operation-timeout](#hoodiewritecommitcallbackpulsaroperation-timeout) | 30s (Optional) | Duration of waiting for completing an operation.

`Config Param: OPERATION_TIMEOUT`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.block-if-queue-full](#hoodiewritecommitcallbackpulsarproducerblock-if-queue-full) | true (Optional) | When the queue is full, the send call blocks instead of throwing an exception.

`Config Param: PRODUCER_BLOCK_QUEUE_FULL`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.pending-queue-size](#hoodiewritecommitcallbackpulsarproducerpending-queue-size) | 1000 (Optional) | The maximum size of a queue holding pending messages.

`Config Param: PRODUCER_PENDING_QUEUE_SIZE`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.pending-total-size](#hoodiewritecommitcallbackpulsarproducerpending-total-size) | 50000 (Optional) | The maximum number of pending messages across partitions.

`Config Param: PRODUCER_PENDING_SIZE`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.route-mode](#hoodiewritecommitcallbackpulsarproducerroute-mode) | RoundRobinPartition (Optional) | Message routing logic for producers on partitioned topics.

`Config Param: PRODUCER_ROUTE_MODE`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.producer.send-timeout](#hoodiewritecommitcallbackpulsarproducersend-timeout) | 30s (Optional) | The timeout in each sending to pulsar.

`Config Param: PRODUCER_SEND_TIMEOUT`
`Since Version: 0.11.0` | +| [hoodie.write.commit.callback.pulsar.request-timeout](#hoodiewritecommitcallbackpulsarrequest-timeout) | 60s (Optional) | Duration of waiting for completing a request.

`Config Param: REQUEST_TIMEOUT`
`Since Version: 0.11.0` | --- @@ -918,27 +918,27 @@ Configurations that control locking mechanisms required for concurrency control [**Advanced Configs**](#Common-Lock-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.write.lock.filesystem.path](#hoodiewritelockfilesystempath) | N/A **(Required)** | For DFS based lock providers, path to store the locks under. use Table's meta path as default | 0.8.0 | -| [hoodie.write.lock.hivemetastore.database](#hoodiewritelockhivemetastoredatabase) | N/A **(Required)** | For Hive based lock provider, the Hive database to acquire lock against | 0.8.0 | -| [hoodie.write.lock.hivemetastore.table](#hoodiewritelockhivemetastoretable) | N/A **(Required)** | For Hive based lock provider, the Hive table to acquire lock against | 0.8.0 | -| [hoodie.write.lock.hivemetastore.uris](#hoodiewritelockhivemetastoreuris) | N/A **(Required)** | For Hive based lock provider, the Hive metastore URI to acquire locks against. | 0.8.0 | -| [hoodie.write.lock.zookeeper.base_path](#hoodiewritelockzookeeperbase_path) | N/A **(Required)** | The base path on Zookeeper under which to create lock related ZNodes. This should be same for all concurrent writers to the same table | 0.8.0 | -| [hoodie.write.lock.zookeeper.port](#hoodiewritelockzookeeperport) | N/A **(Required)** | Zookeeper port to connect to. | 0.8.0 | -| [hoodie.write.lock.zookeeper.url](#hoodiewritelockzookeeperurl) | N/A **(Required)** | Zookeeper URL to connect to. | 0.8.0 | -| [hoodie.write.lock.client.num_retries](#hoodiewritelockclientnum_retries) | 50 (Optional) | Maximum number of times to retry to acquire lock additionally from the lock manager. | 0.8.0 | -| [hoodie.write.lock.client.wait_time_ms_between_retry](#hoodiewritelockclientwait_time_ms_between_retry) | 5000 (Optional) | Amount of time to wait between retries on the lock provider by the lock manager | 0.8.0 | -| [hoodie.write.lock.conflict.resolution.strategy](#hoodiewritelockconflictresolutionstrategy) | org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy (Optional) | Lock provider class name, this should be subclass of org.apache.hudi.client.transaction.ConflictResolutionStrategy | 0.8.0 | -| [hoodie.write.lock.filesystem.expire](#hoodiewritelockfilesystemexpire) | 0 (Optional) | For DFS based lock providers, expire time in minutes, must be a non-negative number, default means no expire | 0.12.0 | -| [hoodie.write.lock.max_wait_time_ms_between_retry](#hoodiewritelockmax_wait_time_ms_between_retry) | 16000 (Optional) | Maximum amount of time to wait between retries by lock provider client. This bounds the maximum delay from the exponential backoff. Currently used by ZK based lock provider only. 
| 0.8.0 | -| [hoodie.write.lock.num_retries](#hoodiewritelocknum_retries) | 15 (Optional) | Maximum number of times to retry lock acquire, at each lock provider | 0.8.0 | -| [hoodie.write.lock.provider](#hoodiewritelockprovider) | org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider (Optional) | Lock provider class name, user can provide their own implementation of LockProvider which should be subclass of org.apache.hudi.common.lock.LockProvider | 0.8.0 | -| [hoodie.write.lock.wait_time_ms](#hoodiewritelockwait_time_ms) | 60000 (Optional) | Timeout in ms, to wait on an individual lock acquire() call, at the lock provider. | 0.8.0 | -| [hoodie.write.lock.wait_time_ms_between_retry](#hoodiewritelockwait_time_ms_between_retry) | 1000 (Optional) | Initial amount of time to wait between retries to acquire locks, subsequent retries will exponentially backoff. | 0.8.0 | -| [hoodie.write.lock.zookeeper.connection_timeout_ms](#hoodiewritelockzookeeperconnection_timeout_ms) | 15000 (Optional) | Timeout in ms, to wait for establishing connection with Zookeeper. | 0.8.0 | -| [hoodie.write.lock.zookeeper.lock_key](#hoodiewritelockzookeeperlock_key) | (Optional) | Key name under base_path at which to create a ZNode and acquire lock. Final path on zk will look like base_path/lock_key. If this parameter is not set, we would set it as the table name | 0.8.0 | -| [hoodie.write.lock.zookeeper.session_timeout_ms](#hoodiewritelockzookeepersession_timeout_ms) | 60000 (Optional) | Timeout in ms, to wait after losing connection to ZooKeeper, before the session is expired | 0.8.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.write.lock.filesystem.path](#hoodiewritelockfilesystempath) | N/A **(Required)** | For DFS based lock providers, path to store the locks under. use Table's meta path as default

`Config Param: FILESYSTEM_LOCK_PATH`
`Since Version: 0.8.0` | +| [hoodie.write.lock.hivemetastore.database](#hoodiewritelockhivemetastoredatabase) | N/A **(Required)** | For Hive based lock provider, the Hive database to acquire lock against

`Config Param: HIVE_DATABASE_NAME`
`Since Version: 0.8.0` | +| [hoodie.write.lock.hivemetastore.table](#hoodiewritelockhivemetastoretable) | N/A **(Required)** | For Hive based lock provider, the Hive table to acquire lock against

`Config Param: HIVE_TABLE_NAME`
`Since Version: 0.8.0` | +| [hoodie.write.lock.hivemetastore.uris](#hoodiewritelockhivemetastoreuris) | N/A **(Required)** | For Hive based lock provider, the Hive metastore URI to acquire locks against.

`Config Param: HIVE_METASTORE_URI`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.base_path](#hoodiewritelockzookeeperbase_path) | N/A **(Required)** | The base path on Zookeeper under which to create lock related ZNodes. This should be same for all concurrent writers to the same table

`Config Param: ZK_BASE_PATH`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.port](#hoodiewritelockzookeeperport) | N/A **(Required)** | Zookeeper port to connect to.

`Config Param: ZK_PORT`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.url](#hoodiewritelockzookeeperurl) | N/A **(Required)** | Zookeeper URL to connect to.

`Config Param: ZK_CONNECT_URL`
`Since Version: 0.8.0` | +| [hoodie.write.lock.client.num_retries](#hoodiewritelockclientnum_retries) | 50 (Optional) | Maximum number of times to retry to acquire lock additionally from the lock manager.

`Config Param: LOCK_ACQUIRE_CLIENT_NUM_RETRIES`
`Since Version: 0.8.0` | +| [hoodie.write.lock.client.wait_time_ms_between_retry](#hoodiewritelockclientwait_time_ms_between_retry) | 5000 (Optional) | Amount of time to wait between retries on the lock provider by the lock manager

`Config Param: LOCK_ACQUIRE_CLIENT_RETRY_WAIT_TIME_IN_MILLIS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.conflict.resolution.strategy](#hoodiewritelockconflictresolutionstrategy) | org.apache.hudi.client.transaction.SimpleConcurrentFileWritesConflictResolutionStrategy (Optional) | Conflict resolution strategy class name; this should be a subclass of org.apache.hudi.client.transaction.ConflictResolutionStrategy

`Config Param: WRITE_CONFLICT_RESOLUTION_STRATEGY_CLASS_NAME`
`Since Version: 0.8.0` | +| [hoodie.write.lock.filesystem.expire](#hoodiewritelockfilesystemexpire) | 0 (Optional) | For DFS based lock providers, expire time in minutes, must be a non-negative number, default means no expire

`Config Param: FILESYSTEM_LOCK_EXPIRE`
`Since Version: 0.12.0` | +| [hoodie.write.lock.max_wait_time_ms_between_retry](#hoodiewritelockmax_wait_time_ms_between_retry) | 16000 (Optional) | Maximum amount of time to wait between retries by lock provider client. This bounds the maximum delay from the exponential backoff. Currently used by ZK based lock provider only.

`Config Param: LOCK_ACQUIRE_RETRY_MAX_WAIT_TIME_IN_MILLIS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.num_retries](#hoodiewritelocknum_retries) | 15 (Optional) | Maximum number of times to retry lock acquire, at each lock provider

`Config Param: LOCK_ACQUIRE_NUM_RETRIES`
`Since Version: 0.8.0` | +| [hoodie.write.lock.provider](#hoodiewritelockprovider) | org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider (Optional) | Lock provider class name, user can provide their own implementation of LockProvider which should be subclass of org.apache.hudi.common.lock.LockProvider

`Config Param: LOCK_PROVIDER_CLASS_NAME`
`Since Version: 0.8.0` | +| [hoodie.write.lock.wait_time_ms](#hoodiewritelockwait_time_ms) | 60000 (Optional) | Timeout in ms, to wait on an individual lock acquire() call, at the lock provider.

`Config Param: LOCK_ACQUIRE_WAIT_TIMEOUT_MS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.wait_time_ms_between_retry](#hoodiewritelockwait_time_ms_between_retry) | 1000 (Optional) | Initial amount of time to wait between retries to acquire locks, subsequent retries will exponentially backoff.

`Config Param: LOCK_ACQUIRE_RETRY_WAIT_TIME_IN_MILLIS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.connection_timeout_ms](#hoodiewritelockzookeeperconnection_timeout_ms) | 15000 (Optional) | Timeout in ms, to wait for establishing connection with Zookeeper.

`Config Param: ZK_CONNECTION_TIMEOUT_MS`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.lock_key](#hoodiewritelockzookeeperlock_key) | (Optional) | Key name under base_path at which to create a ZNode and acquire lock. Final path on zk will look like base_path/lock_key. If this parameter is not set, we would set it as the table name

`Config Param: ZK_LOCK_KEY`
`Since Version: 0.8.0` | +| [hoodie.write.lock.zookeeper.session_timeout_ms](#hoodiewritelockzookeepersession_timeout_ms) | 60000 (Optional) | Timeout in ms, to wait after losing connection to ZooKeeper, before the session is expired

`Config Param: ZK_SESSION_TIMEOUT_MS`
`Since Version: 0.8.0` | --- @@ -950,17 +950,17 @@ Configs that control DynamoDB based locking mechanisms required for concurrency [**Advanced Configs**](#DynamoDB-based-Locks-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | ------------- | -| [hoodie.write.lock.dynamodb.endpoint_url](#hoodiewritelockdynamodbendpoint_url) | N/A **(Required)** | For DynamoDB based lock provider, the url endpoint used for Amazon DynamoDB service. Useful for development with a local dynamodb instance. | 0.10.1 | -| [hoodie.write.lock.dynamodb.billing_mode](#hoodiewritelockdynamodbbilling_mode) | PAY_PER_REQUEST (Optional) | For DynamoDB based lock provider, by default it is `PAY_PER_REQUEST` mode. Alternative is `PROVISIONED`. | 0.10.0 | -| [hoodie.write.lock.dynamodb.partition_key](#hoodiewritelockdynamodbpartition_key) | (Optional) | For DynamoDB based lock provider, the partition key for the DynamoDB lock table. Each Hudi dataset should has it's unique key so concurrent writers could refer to the same partition key. By default we use the Hudi table name specified to be the partition key | 0.10.0 | -| [hoodie.write.lock.dynamodb.read_capacity](#hoodiewritelockdynamodbread_capacity) | 20 (Optional) | For DynamoDB based lock provider, read capacity units when using PROVISIONED billing mode | 0.10.0 | -| [hoodie.write.lock.dynamodb.region](#hoodiewritelockdynamodbregion) | us-east-1 (Optional) | For DynamoDB based lock provider, the region used in endpoint for Amazon DynamoDB service. Would try to first get it from AWS_REGION environment variable. If not find, by default use us-east-1 | 0.10.0 | -| [hoodie.write.lock.dynamodb.table](#hoodiewritelockdynamodbtable) | hudi_locks (Optional) | For DynamoDB based lock provider, the name of the DynamoDB table acting as lock table | 0.10.0 | -| [hoodie.write.lock.dynamodb.table_creation_timeout](#hoodiewritelockdynamodbtable_creation_timeout) | 120000 (Optional) | For DynamoDB based lock provider, the maximum number of milliseconds to wait for creating DynamoDB table | 0.10.0 | -| [hoodie.write.lock.dynamodb.write_capacity](#hoodiewritelockdynamodbwrite_capacity) | 10 (Optional) | For DynamoDB based lock provider, write capacity units when using PROVISIONED billing mode | 0.10.0 | -| [hoodie.write.lock.wait_time_ms](#hoodiewritelockwait_time_ms) | 60000 (Optional) | Lock Acquire Wait Timeout in milliseconds | 0.10.0 | +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------- | -------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.write.lock.dynamodb.endpoint_url](#hoodiewritelockdynamodbendpoint_url) | N/A **(Required)** | For DynamoDB based lock provider, the url endpoint used for Amazon DynamoDB service. 
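To make the lock table above concrete, the sketch below (not part of this patch) shows one way the ZooKeeper lock configs could be passed to a Spark datasource write in Java. The DataFrame, base path, ZooKeeper host, and lock key values are placeholder assumptions, and `hoodie.write.concurrency.mode` is assumed to be needed for a multi-writer setup; treat it as illustrative rather than a reference implementation.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class ZkLockedWriteExample {
  // Hypothetical helper: append a DataFrame to a Hudi table guarded by the ZooKeeper lock provider.
  static void appendWithZkLock(Dataset<Row> df, String basePath) {
    df.write().format("hudi")
        // assumed multi-writer setup: optimistic concurrency plus the lock configs from the table above
        .option("hoodie.write.concurrency.mode", "optimistic_concurrency_control")
        .option("hoodie.write.lock.provider",
            "org.apache.hudi.client.transaction.lock.ZookeeperBasedLockProvider")
        .option("hoodie.write.lock.zookeeper.url", "zk1.example.com")   // placeholder quorum host
        .option("hoodie.write.lock.zookeeper.port", "2181")
        .option("hoodie.write.lock.zookeeper.base_path", "/hudi/locks") // same for all writers of this table
        .option("hoodie.write.lock.zookeeper.lock_key", "my_table")     // defaults to the table name if unset
        .option("hoodie.write.lock.num_retries", "15")
        .option("hoodie.write.lock.wait_time_ms", "60000")
        .mode(SaveMode.Append)
        .save(basePath);
  }
}
```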
@@ -950,17 +950,17 @@ Configs that control DynamoDB based locking mechanisms required for concurrency
[**Advanced Configs**](#DynamoDB-based-Locks-Configurations-advanced-configs)

-| Config Name | Default | Description | Since Version |
-| ----------- | ------- | ----------- | ------------- |
-| [hoodie.write.lock.dynamodb.endpoint_url](#hoodiewritelockdynamodbendpoint_url) | N/A **(Required)** | For DynamoDB based lock provider, the url endpoint used for Amazon DynamoDB service. Useful for development with a local dynamodb instance. | 0.10.1 |
-| [hoodie.write.lock.dynamodb.billing_mode](#hoodiewritelockdynamodbbilling_mode) | PAY_PER_REQUEST (Optional) | For DynamoDB based lock provider, by default it is `PAY_PER_REQUEST` mode. Alternative is `PROVISIONED`. | 0.10.0 |
-| [hoodie.write.lock.dynamodb.partition_key](#hoodiewritelockdynamodbpartition_key) | (Optional) | For DynamoDB based lock provider, the partition key for the DynamoDB lock table. Each Hudi dataset should has it's unique key so concurrent writers could refer to the same partition key. By default we use the Hudi table name specified to be the partition key | 0.10.0 |
-| [hoodie.write.lock.dynamodb.read_capacity](#hoodiewritelockdynamodbread_capacity) | 20 (Optional) | For DynamoDB based lock provider, read capacity units when using PROVISIONED billing mode | 0.10.0 |
-| [hoodie.write.lock.dynamodb.region](#hoodiewritelockdynamodbregion) | us-east-1 (Optional) | For DynamoDB based lock provider, the region used in endpoint for Amazon DynamoDB service. Would try to first get it from AWS_REGION environment variable. If not find, by default use us-east-1 | 0.10.0 |
-| [hoodie.write.lock.dynamodb.table](#hoodiewritelockdynamodbtable) | hudi_locks (Optional) | For DynamoDB based lock provider, the name of the DynamoDB table acting as lock table | 0.10.0 |
-| [hoodie.write.lock.dynamodb.table_creation_timeout](#hoodiewritelockdynamodbtable_creation_timeout) | 120000 (Optional) | For DynamoDB based lock provider, the maximum number of milliseconds to wait for creating DynamoDB table | 0.10.0 |
-| [hoodie.write.lock.dynamodb.write_capacity](#hoodiewritelockdynamodbwrite_capacity) | 10 (Optional) | For DynamoDB based lock provider, write capacity units when using PROVISIONED billing mode | 0.10.0 |
-| [hoodie.write.lock.wait_time_ms](#hoodiewritelockwait_time_ms) | 60000 (Optional) | Lock Acquire Wait Timeout in milliseconds | 0.10.0 |
+| Config Name | Default | Description |
+| ----------- | ------- | ----------- |
+| [hoodie.write.lock.dynamodb.endpoint_url](#hoodiewritelockdynamodbendpoint_url) | N/A **(Required)** | For DynamoDB based lock provider, the URL endpoint used for the Amazon DynamoDB service. Useful for development with a local DynamoDB instance.<br><br>`Config Param: DYNAMODB_ENDPOINT_URL`<br>`Since Version: 0.10.1` |
+| [hoodie.write.lock.dynamodb.billing_mode](#hoodiewritelockdynamodbbilling_mode) | PAY_PER_REQUEST (Optional) | For DynamoDB based lock provider, by default it is `PAY_PER_REQUEST` mode. The alternative is `PROVISIONED`.<br><br>`Config Param: DYNAMODB_LOCK_BILLING_MODE`<br>`Since Version: 0.10.0` |
+| [hoodie.write.lock.dynamodb.partition_key](#hoodiewritelockdynamodbpartition_key) | (Optional) | For DynamoDB based lock provider, the partition key for the DynamoDB lock table. Each Hudi dataset should have its own unique key so concurrent writers refer to the same partition key. By default, the specified Hudi table name is used as the partition key.<br><br>`Config Param: DYNAMODB_LOCK_PARTITION_KEY`<br>`Since Version: 0.10.0` |
+| [hoodie.write.lock.dynamodb.read_capacity](#hoodiewritelockdynamodbread_capacity) | 20 (Optional) | For DynamoDB based lock provider, read capacity units when using the PROVISIONED billing mode.<br><br>`Config Param: DYNAMODB_LOCK_READ_CAPACITY`<br>`Since Version: 0.10.0` |
+| [hoodie.write.lock.dynamodb.region](#hoodiewritelockdynamodbregion) | us-east-1 (Optional) | For DynamoDB based lock provider, the region used in the endpoint for the Amazon DynamoDB service. The AWS_REGION environment variable is checked first; if it is not set, us-east-1 is used by default.<br><br>`Config Param: DYNAMODB_LOCK_REGION`<br>`Since Version: 0.10.0` |
+| [hoodie.write.lock.dynamodb.table](#hoodiewritelockdynamodbtable) | hudi_locks (Optional) | For DynamoDB based lock provider, the name of the DynamoDB table acting as the lock table.<br><br>`Config Param: DYNAMODB_LOCK_TABLE_NAME`<br>`Since Version: 0.10.0` |
+| [hoodie.write.lock.dynamodb.table_creation_timeout](#hoodiewritelockdynamodbtable_creation_timeout) | 120000 (Optional) | For DynamoDB based lock provider, the maximum number of milliseconds to wait for creating the DynamoDB table.<br><br>`Config Param: DYNAMODB_LOCK_TABLE_CREATION_TIMEOUT`<br>`Since Version: 0.10.0` |
+| [hoodie.write.lock.dynamodb.write_capacity](#hoodiewritelockdynamodbwrite_capacity) | 10 (Optional) | For DynamoDB based lock provider, write capacity units when using the PROVISIONED billing mode.<br><br>`Config Param: DYNAMODB_LOCK_WRITE_CAPACITY`<br>`Since Version: 0.10.0` |
+| [hoodie.write.lock.wait_time_ms](#hoodiewritelockwait_time_ms) | 60000 (Optional) | Lock acquire wait timeout in milliseconds.<br><br>`Config Param: LOCK_ACQUIRE_WAIT_TIMEOUT_MS_PROP_KEY`<br>`Since Version: 0.10.0` |

---
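Similarly, a hedged sketch of collecting the DynamoDB lock settings above into an options map; the provider class name (from the hudi-aws module) and all values are assumptions to be verified against your Hudi version.

```java
import java.util.HashMap;
import java.util.Map;

public class DynamoDbLockOptionsExample {
  // Hypothetical helper that bundles the DynamoDB lock configs from the table above.
  static Map<String, String> dynamoDbLockOptions() {
    Map<String, String> opts = new HashMap<>();
    // assumed provider class from the hudi-aws module; check the exact name for your Hudi version
    opts.put("hoodie.write.lock.provider",
        "org.apache.hudi.aws.transaction.lock.DynamoDBBasedLockProvider");
    opts.put("hoodie.write.lock.dynamodb.table", "hudi_locks");        // lock table name
    opts.put("hoodie.write.lock.dynamodb.partition_key", "my_table");  // one key per Hudi table
    opts.put("hoodie.write.lock.dynamodb.region", "us-east-1");
    opts.put("hoodie.write.lock.dynamodb.billing_mode", "PAY_PER_REQUEST");
    return opts;                                                       // pass via .options(opts) on the writer
  }
}
```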

@@ -976,19 +976,19 @@ Hudi maintains keys (record key + partition path) for uniquely identifying a par
[**Basic Configs**](#Key-Generator-Options-basic-configs)

-| Config Name | Default | Description | Since Version |
-| ----------- | ------- | ----------- | ------------- |
-| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | N/A **(Required)** | Partition path field. Value to be used at the partitionPath component of HoodieKey. Actual value obtained by invoking .toString() | |
-| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | N/A **(Required)** | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation eg: `a.b.c` | |
-| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Flag to indicate whether to use Hive style partitioning. If set true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values) | |
+| Config Name | Default | Description |
+| ----------- | ------- | ----------- |
+| [hoodie.datasource.write.partitionpath.field](#hoodiedatasourcewritepartitionpathfield) | N/A **(Required)** | Partition path field. Value to be used as the partitionPath component of HoodieKey. Actual value obtained by invoking .toString().<br><br>`Config Param: PARTITIONPATH_FIELD_NAME` |
+| [hoodie.datasource.write.recordkey.field](#hoodiedatasourcewriterecordkeyfield) | N/A **(Required)** | Record key field. Value to be used as the `recordKey` component of `HoodieKey`. Actual value will be obtained by invoking .toString() on the field value. Nested fields can be specified using the dot notation, e.g. `a.b.c`.<br><br>`Config Param: RECORDKEY_FIELD_NAME` |
+| [hoodie.datasource.write.hive_style_partitioning](#hoodiedatasourcewritehive_style_partitioning) | false (Optional) | Flag to indicate whether to use Hive style partitioning. If set to true, the names of partition folders follow <partition_column_name>=<partition_value> format. By default false (the names of partition folders are only partition values).<br><br>`Config Param: HIVE_STYLE_PARTITIONING_ENABLE` |

[**Advanced Configs**](#Key-Generator-Options-advanced-configs)

-| Config Name | Default | Description | Since Version |
-| ----------- | ------- | ----------- | ------------- |
-| [hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled](#hoodiedatasourcewritekeygeneratorconsistentlogicaltimestampenabled) | false (Optional) | When set to true, consistent value will be generated for a logical timestamp type column, like timestamp-millis and timestamp-micros, irrespective of whether row-writer is enabled. Disabled by default so as not to break the pipeline that deploy either fully row-writer path or non row-writer path. For example, if it is kept disabled then record key of timestamp type with value `2016-12-29 09:54:00` will be written as timestamp `2016-12-29 09:54:00.0` in row-writer path, while it will be written as long value `1483023240000000` in non row-writer path. If enabled, then the timestamp value will be written in both the cases. | |
-| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false (Optional) | Should we url encode the partition path value, before creating the folder structure. | |
+| Config Name | Default | Description |
+| ----------- | ------- | ----------- |
+| [hoodie.datasource.write.keygenerator.consistent.logical.timestamp.enabled](#hoodiedatasourcewritekeygeneratorconsistentlogicaltimestampenabled) | false (Optional) | When set to true, a consistent value will be generated for a logical timestamp type column, like timestamp-millis and timestamp-micros, irrespective of whether row-writer is enabled. Disabled by default so as not to break pipelines that deploy either the fully row-writer path or the non row-writer path. For example, if it is kept disabled, a record key of timestamp type with value `2016-12-29 09:54:00` will be written as timestamp `2016-12-29 09:54:00.0` in the row-writer path, while it will be written as long value `1483023240000000` in the non row-writer path. If enabled, the timestamp value will be written in both cases.<br><br>`Config Param: KEYGENERATOR_CONSISTENT_LOGICAL_TIMESTAMP_ENABLED` |
+| [hoodie.datasource.write.partitionpath.urlencode](#hoodiedatasourcewritepartitionpathurlencode) | false (Optional) | Whether to URL encode the partition path value before creating the folder structure.<br><br>`Config Param: URL_ENCODE_PARTITIONING` |

---
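As a quick illustration of the key generator options above, the following sketch writes a DataFrame with an explicit record key, partition path, and Hive-style partitioning; the column names `uuid`, `region`, and `day` are placeholders, not part of the patch.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;

public class KeyGenOptionsExample {
  // Hypothetical write using the key generator options documented above.
  static void writeWithKeys(Dataset<Row> df, String basePath) {
    df.write().format("hudi")
        .option("hoodie.datasource.write.recordkey.field", "uuid")           // placeholder record key column
        .option("hoodie.datasource.write.partitionpath.field", "region,day") // placeholder partition columns
        .option("hoodie.datasource.write.hive_style_partitioning", "true")   // region=.../day=... folder names
        .mode(SaveMode.Overwrite)
        .save(basePath);
  }
}
```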
@@ -1004,45 +1004,45 @@ Configurations that control indexing behavior, which tags incoming records as ei
[**Basic Configs**](#Common-Index-Configs-basic-configs)

-| Config Name | Default | Description | Since Version |
-| ----------- | ------- | ----------- | ------------- |
-| [hoodie.index.type](#hoodieindextype) | N/A **(Required)** | org.apache.hudi.index.HoodieIndex$IndexType: Determines how input records are indexed, i.e., looked up based on the key for the location in the existing table. Default is SIMPLE on Spark engine, and INMEMORY on Flink and Java engines. HBASE: uses an external managed Apache HBase table to store record key to location mapping. HBase index is a global index, enforcing key uniqueness across all partitions in the table. INMEMORY: Uses in-memory hashmap in Spark and Java engine and Flink in-memory state in Flink for indexing. BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced inside partitions. GLOBAL_BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced across all partitions in the table. SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage.Key uniqueness is enforced inside partitions. GLOBAL_SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage.Key uniqueness is enforced across all partitions in the table. BUCKET: locates the file group containing the record fast by using bucket hashing, particularly beneficial in large scale. Use `hoodie.index.bucket.engine` to choose bucket engine type, i.e., how buckets are generated. FLINK_STATE: Internal Config for indexing based on Flink state. RECORD_INDEX: Index which saves the record key to location mappings in the HUDI Metadata Table. Record index is a global index, enforcing key uniqueness across all partitions in the table. Supports sharding to achieve very high scale. | |
+| Config Name | Default | Description |
+| ----------- | ------- | ----------- |
+| [hoodie.index.type](#hoodieindextype) | N/A **(Required)** | org.apache.hudi.index.HoodieIndex$IndexType: Determines how input records are indexed, i.e., looked up based on the key for the location in the existing table. Default is SIMPLE on Spark engine, and INMEMORY on Flink and Java engines. HBASE: uses an external managed Apache HBase table to store record key to location mapping. HBase index is a global index, enforcing key uniqueness across all partitions in the table. INMEMORY: Uses in-memory hashmap in Spark and Java engine and Flink in-memory state in Flink for indexing. BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced inside partitions. GLOBAL_BLOOM: Employs bloom filters built out of the record keys, optionally also pruning candidate files using record key ranges. Key uniqueness is enforced across all partitions in the table. SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage. Key uniqueness is enforced inside partitions. GLOBAL_SIMPLE: Performs a lean join of the incoming update/delete records against keys extracted from the table on storage. Key uniqueness is enforced across all partitions in the table. BUCKET: locates the file group containing the record fast by using bucket hashing, particularly beneficial at large scale. Use `hoodie.index.bucket.engine` to choose the bucket engine type, i.e., how buckets are generated. FLINK_STATE: Internal config for indexing based on Flink state. RECORD_INDEX: Index which saves the record key to location mappings in the HUDI Metadata Table. The record index is a global index, enforcing key uniqueness across all partitions in the table. Supports sharding to achieve very high scale.<br><br>`Config Param: INDEX_TYPE` |

[**Advanced Configs**](#Common-Index-Configs-advanced-configs)

-| Config Name | Default | Description | Since Version |
-| ----------- | ------- | ----------- | ------------- |
-| [hoodie.bucket.index.hash.field](#hoodiebucketindexhashfield) | N/A **(Required)** | Index key. It is used to index the record and find its file group. If not set, use record key field as default | |
-| [hoodie.bucket.index.max.num.buckets](#hoodiebucketindexmaxnumbuckets) | N/A **(Required)** | Only applies if bucket index engine is consistent hashing. Determine the upper bound of the number of buckets in the hudi table. Bucket resizing cannot be done higher than this max limit. | 0.13.0 |
-| [hoodie.bucket.index.min.num.buckets](#hoodiebucketindexminnumbuckets) | N/A **(Required)** | Only applies if bucket index engine is consistent hashing. Determine the lower bound of the number of buckets in the hudi table. Bucket resizing cannot be done lower than this min limit. | 0.13.0 |
-| [hoodie.bloom.index.bucketized.checking](#hoodiebloomindexbucketizedchecking) | true (Optional) | Only applies if index type is BLOOM. When true, bucketized bloom filtering is enabled. This reduces skew seen in sort based bloom index lookup | |
-| [hoodie.bloom.index.filter.dynamic.max.entries](#hoodiebloomindexfilterdynamicmaxentries) | 100000 (Optional) | The threshold for the maximum number of keys to record in a dynamic Bloom filter row. Only applies if filter type is BloomFilterTypeCode.DYNAMIC_V0. | |
-| [hoodie.bloom.index.filter.type](#hoodiebloomindexfiltertype) | DYNAMIC_V0 (Optional) | org.apache.hudi.common.bloom.BloomFilterTypeCode: Filter type used by Bloom filter. SIMPLE: Bloom filter that is based on the configured size. DYNAMIC_V0(default): Bloom filter that is auto sized based on number of keys. | |
-| [hoodie.bloom.index.input.storage.level](#hoodiebloomindexinputstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Only applies when #bloomIndexUseCaching is set. Determine what level of persistence is used to cache input RDDs. Refer to org.apache.spark.storage.StorageLevel for different values | |
-| [hoodie.bloom.index.keys.per.bucket](#hoodiebloomindexkeysperbucket) | 10000000 (Optional) | Only applies if bloomIndexBucketizedChecking is enabled and index type is bloom. This configuration controls the “bucket” size which tracks the number of record-key checks made against a single file and is the unit of work allocated to each partition performing bloom filter lookup. A higher value would amortize the fixed cost of reading a bloom filter to memory. | |
-| [hoodie.bloom.index.parallelism](#hoodiebloomindexparallelism) | 0 (Optional) | Only applies if index type is BLOOM. This is the amount of parallelism for index lookup, which involves a shuffle. By default, this is auto computed based on input workload characteristics. If the parallelism is explicitly configured by the user, the user-configured value is used in defining the actual parallelism. If the indexing stage is slow due to the limited parallelism, you can increase this to tune the performance. | |
-| [hoodie.bloom.index.prune.by.ranges](#hoodiebloomindexprunebyranges) | true (Optional) | Only applies if index type is BLOOM. When true, range information from files to leveraged speed up index lookups. Particularly helpful, if the key has a monotonously increasing prefix, such as timestamp. If the record key is completely random, it is better to turn this off, since range pruning will only add extra overhead to the index lookup. | |
-| [hoodie.bloom.index.update.partition.path](#hoodiebloomindexupdatepartitionpath) | true (Optional) | Only applies if index type is GLOBAL_BLOOM. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition | |
-| [hoodie.bloom.index.use.caching](#hoodiebloomindexusecaching) | true (Optional) | Only applies if index type is BLOOM.When true, the input RDD will cached to speed up index lookup by reducing IO for computing parallelism or affected partitions | |
-| [hoodie.bloom.index.use.metadata](#hoodiebloomindexusemetadata) | false (Optional) | Only applies if index type is BLOOM.When true, the index lookup uses bloom filters and column stats from metadata table when available to speed up the process. | 0.11.0 |
-| [hoodie.bloom.index.use.treebased.filter](#hoodiebloomindexusetreebasedfilter) | true (Optional) | Only applies if index type is BLOOM. When true, interval tree based file pruning optimization is enabled. This mode speeds-up file-pruning based on key ranges when compared with the brute-force mode | |
-| [hoodie.bucket.index.merge.threshold](#hoodiebucketindexmergethreshold) | 0.2 (Optional) | Control if buckets should be merged when using consistent hashing bucket indexSpecifically, if a file slice size is smaller than `hoodie.xxxx.max.file.size` * threshold, then it will be consideredas a merge candidate. | 0.13.0 |
-| [hoodie.bucket.index.num.buckets](#hoodiebucketindexnumbuckets) | 256 (Optional) | Only applies if index type is BUCKET. Determine the number of buckets in the hudi table, and each partition is divided to N buckets. | |
-| [hoodie.bucket.index.split.threshold](#hoodiebucketindexsplitthreshold) | 2.0 (Optional) | Control if the bucket should be split when using consistent hashing bucket index.Specifically, if a file slice size reaches `hoodie.xxxx.max.file.size` * threshold, then split will be carried out. | 0.13.0 |
-| [hoodie.global.index.reconcile.parallelism](#hoodieglobalindexreconcileparallelism) | 60 (Optional) | Only applies if index type is GLOBAL_BLOOM or GLOBAL_SIMPLE. This controls the parallelism for deduplication during indexing where more than 1 record could be tagged due to partition update. | |
-| [hoodie.global.simple.index.parallelism](#hoodieglobalsimpleindexparallelism) | 100 (Optional) | Only applies if index type is GLOBAL_SIMPLE. This limits the parallelism of fetching records from the base files of all table partitions. The index picks the configured parallelism if the number of base files is larger than this configured value; otherwise, the number of base files is used as the parallelism. If the indexing stage is slow due to the limited parallelism, you can increase this to tune the performance. | |
-| [hoodie.index.bloom.fpp](#hoodieindexbloomfpp) | 0.000000001 (Optional) | Only applies if index type is BLOOM. Error rate allowed given the number of entries. This is used to calculate how many bits should be assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), we like to tradeoff disk space for lower false positives. If the number of entries added to bloom filter exceeds the configured value (hoodie.index.bloom.num_entries), then this fpp may not be honored. | |
-| [hoodie.index.bloom.num_entries](#hoodieindexbloomnum_entries) | 60000 (Optional) | Only applies if index type is BLOOM. This is the number of entries to be stored in the bloom filter. The rationale for the default: Assume the maxParquetFileSize is 128MB and averageRecordSize is 1kb and hence we approx a total of 130K records in a file. The default (60000) is roughly half of this approximation. Warning: Setting this very low, will generate a lot of false positives and index lookup will have to scan a lot more files than it has to and setting this to a very high number will increase the size every base file linearly (roughly 4KB for every 50000 entries). This config is also used with DYNAMIC bloom filter which determines the initial size for the bloom. | |
-| [hoodie.index.bucket.engine](#hoodieindexbucketengine) | SIMPLE (Optional) | org.apache.hudi.index.HoodieIndex$BucketIndexEngineType: Determines the type of bucketing or hashing to use when `hoodie.index.type` is set to `BUCKET`. SIMPLE(default): Uses a fixed number of buckets for file groups which cannot shrink or expand. This works for both COW and MOR tables. CONSISTENT_HASHING: Supports dynamic number of buckets with bucket resizing to properly size each bucket. This solves potential data skew problem where one bucket can be significantly larger than others in SIMPLE engine type. This only works with MOR tables. | 0.11.0 |
-| [hoodie.index.class](#hoodieindexclass) | (Optional) | Full path of user-defined index class and must be a subclass of HoodieIndex class. It will take precedence over the hoodie.index.type configuration if specified | |
-| [hoodie.record.index.input.storage.level](#hoodierecordindexinputstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Only applies when #recordIndexUseCaching is set. Determine what level of persistence is used to cache input RDDs. Refer to org.apache.spark.storage.StorageLevel for different values | |
-| [hoodie.record.index.update.partition.path](#hoodierecordindexupdatepartitionpath) | false (Optional) | Similar to Key: 'hoodie.bloom.index.update.partition.path' , default: true , isAdvanced: true , description: Only applies if index type is GLOBAL_BLOOM. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition since version: version is not defined deprecated after: version is not defined), but for record index. | |
-| [hoodie.record.index.use.caching](#hoodierecordindexusecaching) | true (Optional) | Only applies if index type is RECORD_INDEX.When true, the input RDD will be cached to speed up index lookup by reducing IO for computing parallelism or affected partitions | |
-| [hoodie.simple.index.input.storage.level](#hoodiesimpleindexinputstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Only applies when #simpleIndexUseCaching is set. Determine what level of persistence is used to cache input RDDs. Refer to org.apache.spark.storage.StorageLevel for different values | |
-| [hoodie.simple.index.parallelism](#hoodiesimpleindexparallelism) | 0 (Optional) | Only applies if index type is SIMPLE. This limits the parallelism of fetching records from the base files of affected partitions. By default, this is auto computed based on input workload characteristics. If the parallelism is explicitly configured by the user, the user-configured value is used in defining the actual parallelism. If the indexing stage is slow due to the limited parallelism, you can increase this to tune the performance. | |
-| [hoodie.simple.index.update.partition.path](#hoodiesimpleindexupdatepartitionpath) | true (Optional) | Similar to Key: 'hoodie.bloom.index.update.partition.path' , default: true , isAdvanced: true , description: Only applies if index type is GLOBAL_BLOOM. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition since version: version is not defined deprecated after: version is not defined), but for simple index. | |
-| [hoodie.simple.index.use.caching](#hoodiesimpleindexusecaching) | true (Optional) | Only applies if index type is SIMPLE. When true, the incoming writes will cached to speed up index lookup by reducing IO for computing parallelism or affected partitions | |
+| Config Name | Default | Description |
+| ----------- | ------- | ----------- |
+| [hoodie.bucket.index.hash.field](#hoodiebucketindexhashfield) | N/A **(Required)** | Index key. It is used to index the record and find its file group. If not set, the record key field is used as default.<br><br>`Config Param: BUCKET_INDEX_HASH_FIELD` |
+| [hoodie.bucket.index.max.num.buckets](#hoodiebucketindexmaxnumbuckets) | N/A **(Required)** | Only applies if the bucket index engine is consistent hashing. Determines the upper bound of the number of buckets in the hudi table. Bucket resizing cannot be done higher than this max limit.<br><br>`Config Param: BUCKET_INDEX_MAX_NUM_BUCKETS`<br>`Since Version: 0.13.0` |
+| [hoodie.bucket.index.min.num.buckets](#hoodiebucketindexminnumbuckets) | N/A **(Required)** | Only applies if the bucket index engine is consistent hashing. Determines the lower bound of the number of buckets in the hudi table. Bucket resizing cannot be done lower than this min limit.<br><br>`Config Param: BUCKET_INDEX_MIN_NUM_BUCKETS`<br>`Since Version: 0.13.0` |
+| [hoodie.bloom.index.bucketized.checking](#hoodiebloomindexbucketizedchecking) | true (Optional) | Only applies if index type is BLOOM. When true, bucketized bloom filtering is enabled. This reduces skew seen in sort based bloom index lookup.<br><br>`Config Param: BLOOM_INDEX_BUCKETIZED_CHECKING` |
+| [hoodie.bloom.index.filter.dynamic.max.entries](#hoodiebloomindexfilterdynamicmaxentries) | 100000 (Optional) | The threshold for the maximum number of keys to record in a dynamic Bloom filter row. Only applies if filter type is BloomFilterTypeCode.DYNAMIC_V0.<br><br>`Config Param: BLOOM_INDEX_FILTER_DYNAMIC_MAX_ENTRIES` |
+| [hoodie.bloom.index.filter.type](#hoodiebloomindexfiltertype) | DYNAMIC_V0 (Optional) | org.apache.hudi.common.bloom.BloomFilterTypeCode: Filter type used by the Bloom filter. SIMPLE: Bloom filter that is based on the configured size. DYNAMIC_V0(default): Bloom filter that is auto sized based on the number of keys.<br><br>`Config Param: BLOOM_FILTER_TYPE` |
+| [hoodie.bloom.index.input.storage.level](#hoodiebloomindexinputstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Only applies when #bloomIndexUseCaching is set. Determines what level of persistence is used to cache input RDDs. Refer to org.apache.spark.storage.StorageLevel for different values.<br><br>`Config Param: BLOOM_INDEX_INPUT_STORAGE_LEVEL_VALUE` |
+| [hoodie.bloom.index.keys.per.bucket](#hoodiebloomindexkeysperbucket) | 10000000 (Optional) | Only applies if bloomIndexBucketizedChecking is enabled and index type is bloom. This configuration controls the “bucket” size, which tracks the number of record-key checks made against a single file and is the unit of work allocated to each partition performing bloom filter lookup. A higher value would amortize the fixed cost of reading a bloom filter into memory.<br><br>`Config Param: BLOOM_INDEX_KEYS_PER_BUCKET` |
+| [hoodie.bloom.index.parallelism](#hoodiebloomindexparallelism) | 0 (Optional) | Only applies if index type is BLOOM. This is the amount of parallelism for index lookup, which involves a shuffle. By default, this is auto computed based on input workload characteristics. If the parallelism is explicitly configured by the user, the user-configured value is used in defining the actual parallelism. If the indexing stage is slow due to limited parallelism, you can increase this to tune the performance.<br><br>`Config Param: BLOOM_INDEX_PARALLELISM` |
+| [hoodie.bloom.index.prune.by.ranges](#hoodiebloomindexprunebyranges) | true (Optional) | Only applies if index type is BLOOM. When true, range information from files is leveraged to speed up index lookups. Particularly helpful if the key has a monotonically increasing prefix, such as a timestamp. If the record key is completely random, it is better to turn this off, since range pruning will only add extra overhead to the index lookup.<br><br>`Config Param: BLOOM_INDEX_PRUNE_BY_RANGES` |
+| [hoodie.bloom.index.update.partition.path](#hoodiebloomindexupdatepartitionpath) | true (Optional) | Only applies if index type is GLOBAL_BLOOM. When set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition.<br><br>`Config Param: BLOOM_INDEX_UPDATE_PARTITION_PATH_ENABLE` |
+| [hoodie.bloom.index.use.caching](#hoodiebloomindexusecaching) | true (Optional) | Only applies if index type is BLOOM. When true, the input RDD will be cached to speed up index lookup by reducing IO for computing parallelism or affected partitions.<br><br>`Config Param: BLOOM_INDEX_USE_CACHING` |
+| [hoodie.bloom.index.use.metadata](#hoodiebloomindexusemetadata) | false (Optional) | Only applies if index type is BLOOM. When true, the index lookup uses bloom filters and column stats from the metadata table, when available, to speed up the process.<br><br>`Config Param: BLOOM_INDEX_USE_METADATA`<br>`Since Version: 0.11.0` |
+| [hoodie.bloom.index.use.treebased.filter](#hoodiebloomindexusetreebasedfilter) | true (Optional) | Only applies if index type is BLOOM. When true, interval tree based file pruning optimization is enabled. This mode speeds up file pruning based on key ranges when compared with the brute-force mode.<br><br>`Config Param: BLOOM_INDEX_TREE_BASED_FILTER` |
+| [hoodie.bucket.index.merge.threshold](#hoodiebucketindexmergethreshold) | 0.2 (Optional) | Controls whether buckets should be merged when using the consistent hashing bucket index. Specifically, if a file slice size is smaller than `hoodie.xxxx.max.file.size` * threshold, then it will be considered as a merge candidate.<br><br>`Config Param: BUCKET_MERGE_THRESHOLD`<br>`Since Version: 0.13.0` |
+| [hoodie.bucket.index.num.buckets](#hoodiebucketindexnumbuckets) | 256 (Optional) | Only applies if index type is BUCKET. Determines the number of buckets in the hudi table; each partition is divided into N buckets.<br><br>`Config Param: BUCKET_INDEX_NUM_BUCKETS` |
+| [hoodie.bucket.index.split.threshold](#hoodiebucketindexsplitthreshold) | 2.0 (Optional) | Controls whether the bucket should be split when using the consistent hashing bucket index. Specifically, if a file slice size reaches `hoodie.xxxx.max.file.size` * threshold, then a split will be carried out.<br><br>`Config Param: BUCKET_SPLIT_THRESHOLD`<br>`Since Version: 0.13.0` |
+| [hoodie.global.index.reconcile.parallelism](#hoodieglobalindexreconcileparallelism) | 60 (Optional) | Only applies if index type is GLOBAL_BLOOM or GLOBAL_SIMPLE. This controls the parallelism for deduplication during indexing where more than 1 record could be tagged due to partition update.<br><br>`Config Param: GLOBAL_INDEX_RECONCILE_PARALLELISM` |
+| [hoodie.global.simple.index.parallelism](#hoodieglobalsimpleindexparallelism) | 100 (Optional) | Only applies if index type is GLOBAL_SIMPLE. This limits the parallelism of fetching records from the base files of all table partitions. The index picks the configured parallelism if the number of base files is larger than this configured value; otherwise, the number of base files is used as the parallelism. If the indexing stage is slow due to limited parallelism, you can increase this to tune the performance.<br><br>`Config Param: GLOBAL_SIMPLE_INDEX_PARALLELISM` |
+| [hoodie.index.bloom.fpp](#hoodieindexbloomfpp) | 0.000000001 (Optional) | Only applies if index type is BLOOM. Error rate allowed given the number of entries. This is used to calculate how many bits should be assigned for the bloom filter and the number of hash functions. This is usually set very low (default: 0.000000001), since we prefer to trade off disk space for lower false positives. If the number of entries added to the bloom filter exceeds the configured value (hoodie.index.bloom.num_entries), then this fpp may not be honored.<br><br>`Config Param: BLOOM_FILTER_FPP_VALUE` |
+| [hoodie.index.bloom.num_entries](#hoodieindexbloomnum_entries) | 60000 (Optional) | Only applies if index type is BLOOM. This is the number of entries to be stored in the bloom filter. The rationale for the default: assume the maxParquetFileSize is 128MB and the averageRecordSize is 1kb, which gives approximately 130K records in a file. The default (60000) is roughly half of this approximation. Warning: setting this very low will generate a lot of false positives and the index lookup will have to scan far more files than it needs to, while setting this to a very high number will increase the size of every base file linearly (roughly 4KB for every 50000 entries). This config is also used with the DYNAMIC bloom filter, where it determines the initial size of the filter.<br><br>`Config Param: BLOOM_FILTER_NUM_ENTRIES_VALUE` |
+| [hoodie.index.bucket.engine](#hoodieindexbucketengine) | SIMPLE (Optional) | org.apache.hudi.index.HoodieIndex$BucketIndexEngineType: Determines the type of bucketing or hashing to use when `hoodie.index.type` is set to `BUCKET`. SIMPLE(default): Uses a fixed number of buckets for file groups which cannot shrink or expand. This works for both COW and MOR tables. CONSISTENT_HASHING: Supports a dynamic number of buckets with bucket resizing to properly size each bucket. This solves the potential data skew problem where one bucket can be significantly larger than others in the SIMPLE engine type. This only works with MOR tables.<br><br>`Config Param: BUCKET_INDEX_ENGINE_TYPE`<br>`Since Version: 0.11.0` |
+| [hoodie.index.class](#hoodieindexclass) | (Optional) | Full path of a user-defined index class; must be a subclass of the HoodieIndex class. It will take precedence over the hoodie.index.type configuration if specified.<br><br>`Config Param: INDEX_CLASS_NAME` |
+| [hoodie.record.index.input.storage.level](#hoodierecordindexinputstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Only applies when #recordIndexUseCaching is set. Determines what level of persistence is used to cache input RDDs. Refer to org.apache.spark.storage.StorageLevel for different values.<br><br>`Config Param: RECORD_INDEX_INPUT_STORAGE_LEVEL_VALUE` |
+| [hoodie.record.index.update.partition.path](#hoodierecordindexupdatepartitionpath) | false (Optional) | Similar to `hoodie.bloom.index.update.partition.path`, but for the record index: when set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition.<br><br>`Config Param: RECORD_INDEX_UPDATE_PARTITION_PATH_ENABLE` |
+| [hoodie.record.index.use.caching](#hoodierecordindexusecaching) | true (Optional) | Only applies if index type is RECORD_INDEX. When true, the input RDD will be cached to speed up index lookup by reducing IO for computing parallelism or affected partitions.<br><br>`Config Param: RECORD_INDEX_USE_CACHING` |
+| [hoodie.simple.index.input.storage.level](#hoodiesimpleindexinputstoragelevel) | MEMORY_AND_DISK_SER (Optional) | Only applies when #simpleIndexUseCaching is set. Determines what level of persistence is used to cache input RDDs. Refer to org.apache.spark.storage.StorageLevel for different values.<br><br>`Config Param: SIMPLE_INDEX_INPUT_STORAGE_LEVEL_VALUE` |
+| [hoodie.simple.index.parallelism](#hoodiesimpleindexparallelism) | 0 (Optional) | Only applies if index type is SIMPLE. This limits the parallelism of fetching records from the base files of affected partitions. By default, this is auto computed based on input workload characteristics. If the parallelism is explicitly configured by the user, the user-configured value is used in defining the actual parallelism. If the indexing stage is slow due to limited parallelism, you can increase this to tune the performance.<br><br>`Config Param: SIMPLE_INDEX_PARALLELISM` |
+| [hoodie.simple.index.update.partition.path](#hoodiesimpleindexupdatepartitionpath) | true (Optional) | Similar to `hoodie.bloom.index.update.partition.path`, but for the simple index: when set to true, an update including the partition path of a record that already exists will result in inserting the incoming record into the new partition and deleting the original record in the old partition. When set to false, the original record will only be updated in the old partition.<br><br>`Config Param: SIMPLE_INDEX_UPDATE_PARTITION_PATH_ENABLE` |
+| [hoodie.simple.index.use.caching](#hoodiesimpleindexusecaching) | true (Optional) | Only applies if index type is SIMPLE. When true, the incoming writes will be cached to speed up index lookup by reducing IO for computing parallelism or affected partitions.<br><br>`Config Param: SIMPLE_INDEX_USE_CACHING` |

---
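To show how the index configs above are typically combined, here is a small, hedged sketch that selects a BLOOM index and tunes a couple of the knobs from the table; the values are illustrative, not recommendations from the patch.

```java
import java.util.HashMap;
import java.util.Map;

public class IndexOptionsExample {
  // Hypothetical option map choosing a BLOOM index and tuning a few of the settings above.
  static Map<String, String> bloomIndexOptions() {
    Map<String, String> opts = new HashMap<>();
    opts.put("hoodie.index.type", "BLOOM");                  // or GLOBAL_BLOOM / SIMPLE / BUCKET / RECORD_INDEX
    opts.put("hoodie.bloom.index.prune.by.ranges", "true");  // helps when keys have a monotonically increasing prefix
    opts.put("hoodie.bloom.index.use.metadata", "false");
    opts.put("hoodie.index.bloom.num_entries", "60000");
    return opts;                                             // pass via .options(opts) on a Hudi writer
  }
}
```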
| | -| [hoodie.index.hbase.bucket.number](#hoodieindexhbasebucketnumber) | 8 (Optional) | Only applicable when using RebalancedSparkHoodieHBaseIndex, same as hbase regions count can get the best performance | | -| [hoodie.index.hbase.desired_puts_time_in_secs](#hoodieindexhbasedesired_puts_time_in_secs) | 600 (Optional) | | | -| [hoodie.index.hbase.dynamic_qps](#hoodieindexhbasedynamic_qps) | false (Optional) | Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on write volume. | | -| [hoodie.index.hbase.get.batch.size](#hoodieindexhbasegetbatchsize) | 100 (Optional) | Controls the batch size for performing gets against HBase. Batching improves throughput, by saving round trips. | | -| [hoodie.index.hbase.max.qps.per.region.server](#hoodieindexhbasemaxqpsperregionserver) | 1000 (Optional) | Property to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to limit the aggregate QPS generated across various jobs to an Hbase Region Server. It is recommended to set this value based on global indexing throughput needs and most importantly, how much the HBase installation in use is able to tolerate without Region Servers going down. | | -| [hoodie.index.hbase.put.batch.size](#hoodieindexhbaseputbatchsize) | 100 (Optional) | Controls the batch size for performing puts against HBase. Batching improves throughput, by saving round trips. | | -| [hoodie.index.hbase.put.batch.size.autocompute](#hoodieindexhbaseputbatchsizeautocompute) | false (Optional) | Property to set to enable auto computation of put batch size | | -| [hoodie.index.hbase.qps.allocator.class](#hoodieindexhbaseqpsallocatorclass) | org.apache.hudi.index.hbase.DefaultHBaseQPSResourceAllocator (Optional) | Property to set which implementation of HBase QPS resource allocator to be used, whichcontrols the batching rate dynamically. | | -| [hoodie.index.hbase.qps.fraction](#hoodieindexhbaseqpsfraction) | 0.5 (Optional) | Property to set the fraction of the global share of QPS that should be allocated to this job. Let's say there are 3 jobs which have input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively. Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming Region Servers. | | -| [hoodie.index.hbase.rollback.sync](#hoodieindexhbaserollbacksync) | false (Optional) | When set to true, the rollback method will delete the last failed task index. The default value is false. Because deleting the index will add extra load on the Hbase cluster for each rollback | | -| [hoodie.index.hbase.security.authentication](#hoodieindexhbasesecurityauthentication) | simple (Optional) | Property to decide if the hbase cluster secure authentication is enabled or not. Possible values are 'simple' (no authentication), and 'kerberos'. | | -| [hoodie.index.hbase.zk.connection_timeout_ms](#hoodieindexhbasezkconnection_timeout_ms) | 15000 (Optional) | Timeout to use for establishing connection with zookeeper, from HBase client. | | -| [hoodie.index.hbase.zk.session_timeout_ms](#hoodieindexhbasezksession_timeout_ms) | 60000 (Optional) | Session timeout value to use for Zookeeper failure detection, for the HBase client.Lower this value, if you want to fail faster. | | -| [hoodie.index.hbase.zkpath.qps_root](#hoodieindexhbasezkpathqps_root) | /QPS_ROOT (Optional) | chroot in zookeeper, to use for all qps allocation co-ordination. 
| | +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.index.hbase.kerberos.user.keytab](#hoodieindexhbasekerberosuserkeytab) | N/A **(Required)** | File name of the kerberos keytab file for connecting to the hbase cluster.

`Config Param: KERBEROS_USER_KEYTAB` | +| [hoodie.index.hbase.kerberos.user.principal](#hoodieindexhbasekerberosuserprincipal) | N/A **(Required)** | The kerberos principal name for connecting to the hbase cluster.

`Config Param: KERBEROS_USER_PRINCIPAL` | +| [hoodie.index.hbase.master.kerberos.principal](#hoodieindexhbasemasterkerberosprincipal) | N/A **(Required)** | The value of hbase.master.kerberos.principal in hbase cluster.

`Config Param: MASTER_PRINCIPAL` | +| [hoodie.index.hbase.max.qps.fraction](#hoodieindexhbasemaxqpsfraction) | N/A **(Required)** | Maximum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads

`Config Param: MAX_QPS_FRACTION` | +| [hoodie.index.hbase.min.qps.fraction](#hoodieindexhbaseminqpsfraction) | N/A **(Required)** | Minimum for HBASE_QPS_FRACTION_PROP to stabilize skewed write workloads

`Config Param: MIN_QPS_FRACTION` | +| [hoodie.index.hbase.regionserver.kerberos.principal](#hoodieindexhbaseregionserverkerberosprincipal) | N/A **(Required)** | The value of hbase.regionserver.kerberos.principal in hbase cluster.

`Config Param: REGIONSERVER_PRINCIPAL` | +| [hoodie.index.hbase.sleep.ms.for.get.batch](#hoodieindexhbasesleepmsforgetbatch) | N/A **(Required)** |

`Config Param: SLEEP_MS_FOR_GET_BATCH` | +| [hoodie.index.hbase.sleep.ms.for.put.batch](#hoodieindexhbasesleepmsforputbatch) | N/A **(Required)** |

`Config Param: SLEEP_MS_FOR_PUT_BATCH` | +| [hoodie.index.hbase.table](#hoodieindexhbasetable) | N/A **(Required)** | Only applies if index type is HBASE. HBase Table name to use as the index. Hudi stores the row_key and [partition_path, fileID, commitTime] mapping in the table

`Config Param: TABLENAME` | +| [hoodie.index.hbase.zknode.path](#hoodieindexhbasezknodepath) | N/A **(Required)** | Only applies if index type is HBASE. This is the root znode that will contain all the znodes created/used by HBase

`Config Param: ZK_NODE_PATH` | +| [hoodie.index.hbase.zkport](#hoodieindexhbasezkport) | N/A **(Required)** | Only applies if index type is HBASE. HBase ZK Quorum port to connect to

`Config Param: ZKPORT` | +| [hoodie.index.hbase.zkquorum](#hoodieindexhbasezkquorum) | N/A **(Required)** | Only applies if index type is HBASE. HBase ZK Quorum url to connect to

`Config Param: ZKQUORUM` | +| [hoodie.hbase.index.update.partition.path](#hoodiehbaseindexupdatepartitionpath) | false (Optional) | Only applies if index type is HBASE. When an already existing record is upserted to a new partition compared to what's in storage, this config, when set, will delete the old record in the old partition and insert it as a new record in the new partition.<br>

`Config Param: UPDATE_PARTITION_PATH_ENABLE` | +| [hoodie.index.hbase.bucket.number](#hoodieindexhbasebucketnumber) | 8 (Optional) | Only applicable when using RebalancedSparkHoodieHBaseIndex; setting this to the same value as the HBase region count can get the best performance<br>

`Config Param: BUCKET_NUMBER` | +| [hoodie.index.hbase.desired_puts_time_in_secs](#hoodieindexhbasedesired_puts_time_in_secs) | 600 (Optional) |

`Config Param: DESIRED_PUTS_TIME_IN_SECONDS` | +| [hoodie.index.hbase.dynamic_qps](#hoodieindexhbasedynamic_qps) | false (Optional) | Property to decide if HBASE_QPS_FRACTION_PROP is dynamically calculated based on write volume.

`Config Param: COMPUTE_QPS_DYNAMICALLY` | +| [hoodie.index.hbase.get.batch.size](#hoodieindexhbasegetbatchsize) | 100 (Optional) | Controls the batch size for performing gets against HBase. Batching improves throughput, by saving round trips.

`Config Param: GET_BATCH_SIZE` | +| [hoodie.index.hbase.max.qps.per.region.server](#hoodieindexhbasemaxqpsperregionserver) | 1000 (Optional) | Property to set maximum QPS allowed per Region Server. This should be same across various jobs. This is intended to limit the aggregate QPS generated across various jobs to an Hbase Region Server. It is recommended to set this value based on global indexing throughput needs and most importantly, how much the HBase installation in use is able to tolerate without Region Servers going down.

`Config Param: MAX_QPS_PER_REGION_SERVER` | +| [hoodie.index.hbase.put.batch.size](#hoodieindexhbaseputbatchsize) | 100 (Optional) | Controls the batch size for performing puts against HBase. Batching improves throughput, by saving round trips.

`Config Param: PUT_BATCH_SIZE` | +| [hoodie.index.hbase.put.batch.size.autocompute](#hoodieindexhbaseputbatchsizeautocompute) | false (Optional) | Property to set to enable auto computation of put batch size

`Config Param: PUT_BATCH_SIZE_AUTO_COMPUTE` | +| [hoodie.index.hbase.qps.allocator.class](#hoodieindexhbaseqpsallocatorclass) | org.apache.hudi.index.hbase.DefaultHBaseQPSResourceAllocator (Optional) | Property to set which implementation of HBase QPS resource allocator to be used, which controls the batching rate dynamically.<br>

`Config Param: QPS_ALLOCATOR_CLASS_NAME` | +| [hoodie.index.hbase.qps.fraction](#hoodieindexhbaseqpsfraction) | 0.5 (Optional) | Property to set the fraction of the global share of QPS that should be allocated to this job. Let's say there are 3 jobs which have input size in terms of number of rows required for HbaseIndexing as x, 2x, 3x respectively. Then this fraction for the jobs would be (0.17) 1/6, 0.33 (2/6) and 0.5 (3/6) respectively. Default is 50%, which means a total of 2 jobs can run using HbaseIndex without overwhelming Region Servers.

`Config Param: QPS_FRACTION` | +| [hoodie.index.hbase.rollback.sync](#hoodieindexhbaserollbacksync) | false (Optional) | When set to true, the rollback method will delete the last failed task index. The default value is false. Because deleting the index will add extra load on the Hbase cluster for each rollback

`Config Param: ROLLBACK_SYNC_ENABLE` | +| [hoodie.index.hbase.security.authentication](#hoodieindexhbasesecurityauthentication) | simple (Optional) | Property to decide if the hbase cluster secure authentication is enabled or not. Possible values are 'simple' (no authentication), and 'kerberos'.

`Config Param: SECURITY_AUTHENTICATION` | +| [hoodie.index.hbase.zk.connection_timeout_ms](#hoodieindexhbasezkconnection_timeout_ms) | 15000 (Optional) | Timeout to use for establishing connection with zookeeper, from HBase client.

`Config Param: ZK_CONNECTION_TIMEOUT_MS` | +| [hoodie.index.hbase.zk.session_timeout_ms](#hoodieindexhbasezksession_timeout_ms) | 60000 (Optional) | Session timeout value to use for Zookeeper failure detection, for the HBase client. Lower this value if you want to fail faster.<br>

`Config Param: ZK_SESSION_TIMEOUT_MS` | +| [hoodie.index.hbase.zkpath.qps_root](#hoodieindexhbasezkpathqps_root) | /QPS_ROOT (Optional) | chroot in zookeeper, to use for all qps allocation co-ordination.

`Config Param: ZKPATH_QPS_ROOT` | --- ## Metastore and Catalog Sync Configs {#META_SYNC} @@ -1097,28 +1097,28 @@ Configurations used by the Hudi to sync metadata to external metastores and cata [**Basic Configs**](#Common-Metadata-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.
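
As a usage illustration for the HBase index configs listed above, here is a minimal, hypothetical Java sketch that collects a few of these keys into a plain `java.util.Properties` object. The ZooKeeper endpoint and table values are placeholders, and the `hoodie.index.type=HBASE` switch is an assumption drawn from the general index configs rather than from this table.

```java
import java.util.Properties;

// Hypothetical helper: gathers a handful of the HBase index options named in the
// table above into a java.util.Properties object that a Hudi writer could consume.
public class HBaseIndexOptionsExample {
  public static void main(String[] args) {
    Properties props = new Properties();
    // Assumed: switch the index type to HBASE (hoodie.index.type is not part of this table).
    props.setProperty("hoodie.index.type", "HBASE");
    // Required HBase index connection settings from the table above (placeholder values).
    props.setProperty("hoodie.index.hbase.zkquorum", "zk1.example.com,zk2.example.com");
    props.setProperty("hoodie.index.hbase.zkport", "2181");
    props.setProperty("hoodie.index.hbase.table", "hudi_record_index");
    // Optional tuning knobs, shown with their documented defaults.
    props.setProperty("hoodie.index.hbase.get.batch.size", "100");
    props.setProperty("hoodie.index.hbase.put.batch.size", "100");
    props.setProperty("hoodie.index.hbase.qps.fraction", "0.5");
    props.forEach((k, v) -> System.out.println(k + "=" + v));
  }
}
```

The same key-value pairs could equally be passed as Spark datasource options or placed in a properties file; only the keys themselves come from the table above.
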

`Config Param: META_SYNC_ENABLED` | [**Advanced Configs**](#Common-Metadata-Sync-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd | | -| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync. | | -| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to. | | -| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'. | | -| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns. | | -| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to. | | -| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync | | -| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change. | | -| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false. | | -| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost. | 0.14.0 | -| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores | | -| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | sync meta info to origin table if enable | 0.14.0 | -| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore. 
| | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd

`Config Param: META_SYNC_ASSUME_DATE_PARTITION` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync.

`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to.

`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.

`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns.

`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to.

`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync

`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change.

`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.

`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.

`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores

`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | Sync meta info to the origin table if enabled<br>

`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore.

`Config Param: META_SYNC_SPARK_VERSION` | --- @@ -1130,10 +1130,10 @@ Configs that control Glue catalog sync based client. [**Advanced Configs**](#Glue-catalog-sync-based-client-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------ | ---------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.meta.sync.glue.metadata_file_listing](#hoodiedatasourcemetasyncgluemetadata_file_listing) | false (Optional) | Makes athena use the metadata table to list partitions and files. Currently it won't benefit from other features such stats indexes | 0.14.0 | -| [hoodie.datasource.meta.sync.glue.skip_table_archive](#hoodiedatasourcemetasyncglueskip_table_archive) | true (Optional) | Glue catalog sync based client will skip archiving the table version if this config is set to true | 0.14.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------ | ---------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.glue.metadata_file_listing](#hoodiedatasourcemetasyncgluemetadata_file_listing) | false (Optional) | Makes athena use the metadata table to list partitions and files. Currently it won't benefit from other features such stats indexes
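
To show how the common meta sync keys above fit together, here is a small hypothetical Java sketch that builds the option map a sync-enabled write would carry. The database, table, and partition field names are made-up example values.

```java
import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical sketch: the shared meta sync options from the table above,
// gathered into a map that could be passed as writer options.
public class CommonMetaSyncOptionsExample {
  public static void main(String[] args) {
    Map<String, String> opts = new LinkedHashMap<>();
    opts.put("hoodie.datasource.meta.sync.enable", "true");          // turn on meta sync
    opts.put("hoodie.datasource.hive_sync.database", "analytics");   // destination database (example value)
    opts.put("hoodie.datasource.hive_sync.table", "trips");          // destination table (example value)
    opts.put("hoodie.datasource.hive_sync.partition_fields", "dt");  // partition column(s) (example value)
    opts.put("hoodie.datasource.hive_sync.partition_extractor_class",
        "org.apache.hudi.hive.MultiPartKeysValueExtractor");         // documented default
    opts.forEach((k, v) -> System.out.println(k + "=" + v));
  }
}
```
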

`Config Param: GLUE_METADATA_FILE_LISTING`
`Since Version: 0.14.0` | +| [hoodie.datasource.meta.sync.glue.skip_table_archive](#hoodiedatasourcemetasyncglueskip_table_archive) | true (Optional) | Glue catalog sync based client will skip archiving the table version if this config is set to true

`Config Param: GLUE_SKIP_TABLE_ARCHIVE`
`Since Version: 0.14.0` | --- @@ -1145,39 +1145,39 @@ Configurations used by the Hudi to sync metadata to Google BigQuery. [**Basic Configs**](#BigQuery-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.
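
For the two Glue-specific keys above, a minimal sketch of how they might sit next to the generic meta sync switch; the values shown mirror the documented defaults except where noted.

```java
import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical sketch: Glue catalog sync options alongside the generic meta sync switch.
public class GlueSyncOptionsExample {
  public static void main(String[] args) {
    Map<String, String> opts = new LinkedHashMap<>();
    opts.put("hoodie.datasource.meta.sync.enable", "true");
    // Skip archiving table versions in Glue (documented default: true).
    opts.put("hoodie.datasource.meta.sync.glue.skip_table_archive", "true");
    // Let Athena list partitions and files via the metadata table (documented default: false).
    opts.put("hoodie.datasource.meta.sync.glue.metadata_file_listing", "false");
    opts.forEach((k, v) -> System.out.println(k + "=" + v));
  }
}
```
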

`Config Param: META_SYNC_ENABLED` | [**Advanced Configs**](#BigQuery-Sync-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.gcp.bigquery.sync.dataset_location](#hoodiegcpbigquerysyncdataset_location) | N/A **(Required)** | Location of the target dataset in BigQuery | | -| [hoodie.gcp.bigquery.sync.project_id](#hoodiegcpbigquerysyncproject_id) | N/A **(Required)** | Name of the target project in BigQuery | | -| [hoodie.gcp.bigquery.sync.source_uri](#hoodiegcpbigquerysyncsource_uri) | N/A **(Required)** | Name of the source uri gcs path of the table | | -| [hoodie.gcp.bigquery.sync.source_uri_prefix](#hoodiegcpbigquerysyncsource_uri_prefix) | N/A **(Required)** | Name of the source uri gcs path prefix of the table | | -| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd | | -| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync. | | -| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to. | | -| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'. | | -| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns. | | -| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to. | | -| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync | | -| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change. | | -| [hoodie.gcp.bigquery.sync.assume_date_partitioning](#hoodiegcpbigquerysyncassume_date_partitioning) | false (Optional) | Assume standard yyyy/mm/dd partitioning, this exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter | | -| [hoodie.gcp.bigquery.sync.base_path](#hoodiegcpbigquerysyncbase_path) | (Optional) | Base path of the hoodie table to sync | | -| [hoodie.gcp.bigquery.sync.dataset_name](#hoodiegcpbigquerysyncdataset_name) | (Optional) | Name of the target dataset in BigQuery | | -| [hoodie.gcp.bigquery.sync.partition_fields](#hoodiegcpbigquerysyncpartition_fields) | (Optional) | Comma-delimited partition fields. Default to non-partitioned. 
| | -| [hoodie.gcp.bigquery.sync.table_name](#hoodiegcpbigquerysynctable_name) | (Optional) | Name of the target table in BigQuery | | -| [hoodie.gcp.bigquery.sync.use_bq_manifest_file](#hoodiegcpbigquerysyncuse_bq_manifest_file) | false (Optional) | If true, generate a manifest file with data file absolute paths and use BigQuery manifest file support to directly create one external table over the Hudi table. If false (default), generate a manifest file with data file names and create two external tables and one view in BigQuery. Query the view for the same results as querying the Hudi table | | -| [hoodie.gcp.bigquery.sync.use_file_listing_from_metadata](#hoodiegcpbigquerysyncuse_file_listing_from_metadata) | false (Optional) | Fetch file listing from Hudi's metadata | | -| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false. | | -| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost. | 0.14.0 | -| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores | | -| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | sync meta info to origin table if enable | 0.14.0 | -| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore. | | +| Config Name | Default | Description | +| --------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.gcp.bigquery.sync.dataset_location](#hoodiegcpbigquerysyncdataset_location) | N/A **(Required)** | Location of the target dataset in BigQuery

`Config Param: BIGQUERY_SYNC_DATASET_LOCATION` | +| [hoodie.gcp.bigquery.sync.project_id](#hoodiegcpbigquerysyncproject_id) | N/A **(Required)** | Name of the target project in BigQuery

`Config Param: BIGQUERY_SYNC_PROJECT_ID` | +| [hoodie.gcp.bigquery.sync.source_uri](#hoodiegcpbigquerysyncsource_uri) | N/A **(Required)** | Name of the source uri gcs path of the table

`Config Param: BIGQUERY_SYNC_SOURCE_URI` | +| [hoodie.gcp.bigquery.sync.source_uri_prefix](#hoodiegcpbigquerysyncsource_uri_prefix) | N/A **(Required)** | Name of the source uri gcs path prefix of the table

`Config Param: BIGQUERY_SYNC_SOURCE_URI_PREFIX` | +| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd

`Config Param: META_SYNC_ASSUME_DATE_PARTITION` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync.

`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to.

`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.

`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns.

`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to.

`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync

`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change.

`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.gcp.bigquery.sync.assume_date_partitioning](#hoodiegcpbigquerysyncassume_date_partitioning) | false (Optional) | Assume standard yyyy/mm/dd partitioning, this exists to support backward compatibility. If you use hoodie 0.3.x, do not set this parameter

`Config Param: BIGQUERY_SYNC_ASSUME_DATE_PARTITIONING` | +| [hoodie.gcp.bigquery.sync.base_path](#hoodiegcpbigquerysyncbase_path) | (Optional) | Base path of the hoodie table to sync

`Config Param: BIGQUERY_SYNC_SYNC_BASE_PATH` | +| [hoodie.gcp.bigquery.sync.dataset_name](#hoodiegcpbigquerysyncdataset_name) | (Optional) | Name of the target dataset in BigQuery

`Config Param: BIGQUERY_SYNC_DATASET_NAME` | +| [hoodie.gcp.bigquery.sync.partition_fields](#hoodiegcpbigquerysyncpartition_fields) | (Optional) | Comma-delimited partition fields. Default to non-partitioned.

`Config Param: BIGQUERY_SYNC_PARTITION_FIELDS` | +| [hoodie.gcp.bigquery.sync.table_name](#hoodiegcpbigquerysynctable_name) | (Optional) | Name of the target table in BigQuery

`Config Param: BIGQUERY_SYNC_TABLE_NAME` | +| [hoodie.gcp.bigquery.sync.use_bq_manifest_file](#hoodiegcpbigquerysyncuse_bq_manifest_file) | false (Optional) | If true, generate a manifest file with data file absolute paths and use BigQuery manifest file support to directly create one external table over the Hudi table. If false (default), generate a manifest file with data file names and create two external tables and one view in BigQuery. Query the view for the same results as querying the Hudi table

`Config Param: BIGQUERY_SYNC_USE_BQ_MANIFEST_FILE` | +| [hoodie.gcp.bigquery.sync.use_file_listing_from_metadata](#hoodiegcpbigquerysyncuse_file_listing_from_metadata) | false (Optional) | Fetch file listing from Hudi's metadata

`Config Param: BIGQUERY_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.

`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.

`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores

`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | Sync meta info to the origin table if enabled<br>

`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore.

`Config Param: META_SYNC_SPARK_VERSION` | --- @@ -1189,53 +1189,53 @@ Configurations used by the Hudi to sync metadata to Hive Metastore. [**Basic Configs**](#Hive-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------- | --------------------------------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql. | | -| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore. | | -| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url | | -| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url | | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.
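
Tying the BigQuery sync keys above together, a hypothetical Java sketch of the required settings plus the manifest-file switch; the project, dataset location, and bucket paths are placeholders.

```java
import java.util.LinkedHashMap;
import java.util.Map;

// Hypothetical sketch: required BigQuery sync settings with placeholder values.
public class BigQuerySyncOptionsExample {
  public static void main(String[] args) {
    Map<String, String> opts = new LinkedHashMap<>();
    opts.put("hoodie.gcp.bigquery.sync.project_id", "my-gcp-project");
    opts.put("hoodie.gcp.bigquery.sync.dataset_location", "US");
    opts.put("hoodie.gcp.bigquery.sync.source_uri", "gs://my-bucket/hudi_table/dt=*");
    opts.put("hoodie.gcp.bigquery.sync.source_uri_prefix", "gs://my-bucket/hudi_table/");
    // Optional: one external table via a manifest file instead of the two-table + view layout.
    opts.put("hoodie.gcp.bigquery.sync.use_bq_manifest_file", "true");
    opts.forEach((k, v) -> System.out.println(k + "=" + v));
  }
}
```
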

`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore.

`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url

`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url

`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.

`Config Param: META_SYNC_ENABLED` | [**Advanced Configs**](#Hive-Sync-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.hive_sync.serde_properties](#hoodiedatasourcehive_syncserde_properties) | N/A **(Required)** | Serde properties to hive table. | | -| [hoodie.datasource.hive_sync.table_properties](#hoodiedatasourcehive_synctable_properties) | N/A **(Required)** | Additional properties to store with table. | | -| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd | | -| [hoodie.datasource.hive_sync.auto_create_database](#hoodiedatasourcehive_syncauto_create_database) | true (Optional) | Auto create hive database if does not exists | | -| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync. | | -| [hoodie.datasource.hive_sync.batch_num](#hoodiedatasourcehive_syncbatch_num) | 1000 (Optional) | The number of partitions one batch when synchronous partitions to hive. | | -| [hoodie.datasource.hive_sync.bucket_sync](#hoodiedatasourcehive_syncbucket_sync) | false (Optional) | Whether sync hive metastore bucket specification when using bucket index.The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS' | | -| [hoodie.datasource.hive_sync.bucket_sync_spec](#hoodiedatasourcehive_syncbucket_sync_spec) | (Optional) | The hive metastore bucket specification when using bucket index.The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS' | | -| [hoodie.datasource.hive_sync.create_managed_table](#hoodiedatasourcehive_synccreate_managed_table) | false (Optional) | Whether to sync the table as managed table. | | -| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to. | | -| [hoodie.datasource.hive_sync.filter_pushdown_enabled](#hoodiedatasourcehive_syncfilter_pushdown_enabled) | false (Optional) | Whether to enable push down partitions by filter | | -| [hoodie.datasource.hive_sync.filter_pushdown_max_size](#hoodiedatasourcehive_syncfilter_pushdown_max_size) | 1000 (Optional) | Max size limit to push down partition filters, if the estimate push down filters exceed this size, will directly try to fetch all partitions | | -| [hoodie.datasource.hive_sync.ignore_exceptions](#hoodiedatasourcehive_syncignore_exceptions) | false (Optional) | Ignore exceptions when syncing with Hive. | | -| [hoodie.datasource.hive_sync.omit_metadata_fields](#hoodiedatasourcehive_syncomit_metadata_fields) | false (Optional) | Whether to omit the hoodie metadata fields in the target table. 
| 0.13.0 | -| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'. | | -| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns. | | -| [hoodie.datasource.hive_sync.password](#hoodiedatasourcehive_syncpassword) | hive (Optional) | hive password to use | | -| [hoodie.datasource.hive_sync.schema_string_length_thresh](#hoodiedatasourcehive_syncschema_string_length_thresh) | 4000 (Optional) | | | -| [hoodie.datasource.hive_sync.skip_ro_suffix](#hoodiedatasourcehive_syncskip_ro_suffix) | false (Optional) | Skip the _ro suffix for Read optimized table, when registering | | -| [hoodie.datasource.hive_sync.support_timestamp](#hoodiedatasourcehive_syncsupport_timestamp) | false (Optional) | ‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. Disabled by default for backward compatibility. | | -| [hoodie.datasource.hive_sync.sync_as_datasource](#hoodiedatasourcehive_syncsync_as_datasource) | true (Optional) | | | -| [hoodie.datasource.hive_sync.sync_comment](#hoodiedatasourcehive_syncsync_comment) | false (Optional) | Whether to sync the table column comments while syncing the table. | | -| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to. | | -| [hoodie.datasource.hive_sync.table.strategy](#hoodiedatasourcehive_synctablestrategy) | ALL (Optional) | Hive table synchronization strategy. Available option: RO, RT, ALL. | 0.13.0 | -| [hoodie.datasource.hive_sync.use_jdbc](#hoodiedatasourcehive_syncuse_jdbc) | true (Optional) | Use JDBC when hive synchronization is enabled | | -| [hoodie.datasource.hive_sync.use_pre_apache_input_format](#hoodiedatasourcehive_syncuse_pre_apache_input_format) | false (Optional) | Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. Use this when you are in the process of migrating from com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format | | -| [hoodie.datasource.hive_sync.username](#hoodiedatasourcehive_syncusername) | hive (Optional) | hive user name to use | | -| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync | | -| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change. | | -| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false. | | -| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost. 
| 0.14.0 | -| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores | | -| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | sync meta info to origin table if enable | 0.14.0 | -| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore. | | +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.serde_properties](#hoodiedatasourcehive_syncserde_properties) | N/A **(Required)** | Serde properties to hive table.

`Config Param: HIVE_TABLE_SERDE_PROPERTIES` | +| [hoodie.datasource.hive_sync.table_properties](#hoodiedatasourcehive_synctable_properties) | N/A **(Required)** | Additional properties to store with table.

`Config Param: HIVE_TABLE_PROPERTIES` | +| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd

`Config Param: META_SYNC_ASSUME_DATE_PARTITION` | +| [hoodie.datasource.hive_sync.auto_create_database](#hoodiedatasourcehive_syncauto_create_database) | true (Optional) | Auto create hive database if it does not exist<br>

`Config Param: HIVE_AUTO_CREATE_DATABASE` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync.

`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.batch_num](#hoodiedatasourcehive_syncbatch_num) | 1000 (Optional) | The number of partitions one batch when synchronous partitions to hive.

`Config Param: HIVE_BATCH_SYNC_PARTITION_NUM` | +| [hoodie.datasource.hive_sync.bucket_sync](#hoodiedatasourcehive_syncbucket_sync) | false (Optional) | Whether to sync hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'<br>

`Config Param: HIVE_SYNC_BUCKET_SYNC` | +| [hoodie.datasource.hive_sync.bucket_sync_spec](#hoodiedatasourcehive_syncbucket_sync_spec) | (Optional) | The hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'<br>

`Config Param: HIVE_SYNC_BUCKET_SYNC_SPEC` | +| [hoodie.datasource.hive_sync.create_managed_table](#hoodiedatasourcehive_synccreate_managed_table) | false (Optional) | Whether to sync the table as managed table.

`Config Param: HIVE_CREATE_MANAGED_TABLE` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to.

`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.filter_pushdown_enabled](#hoodiedatasourcehive_syncfilter_pushdown_enabled) | false (Optional) | Whether to enable push down partitions by filter

`Config Param: HIVE_SYNC_FILTER_PUSHDOWN_ENABLED` | +| [hoodie.datasource.hive_sync.filter_pushdown_max_size](#hoodiedatasourcehive_syncfilter_pushdown_max_size) | 1000 (Optional) | Max size limit to push down partition filters, if the estimate push down filters exceed this size, will directly try to fetch all partitions

`Config Param: HIVE_SYNC_FILTER_PUSHDOWN_MAX_SIZE` | +| [hoodie.datasource.hive_sync.ignore_exceptions](#hoodiedatasourcehive_syncignore_exceptions) | false (Optional) | Ignore exceptions when syncing with Hive.

`Config Param: HIVE_IGNORE_EXCEPTIONS` | +| [hoodie.datasource.hive_sync.omit_metadata_fields](#hoodiedatasourcehive_syncomit_metadata_fields) | false (Optional) | Whether to omit the hoodie metadata fields in the target table.

`Config Param: HIVE_SYNC_OMIT_METADATA_FIELDS`
`Since Version: 0.13.0` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.

`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns.

`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.password](#hoodiedatasourcehive_syncpassword) | hive (Optional) | hive password to use

`Config Param: HIVE_PASS` | +| [hoodie.datasource.hive_sync.schema_string_length_thresh](#hoodiedatasourcehive_syncschema_string_length_thresh) | 4000 (Optional) |

`Config Param: HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD` | +| [hoodie.datasource.hive_sync.skip_ro_suffix](#hoodiedatasourcehive_syncskip_ro_suffix) | false (Optional) | Skip the _ro suffix for Read optimized table, when registering

`Config Param: HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE` | +| [hoodie.datasource.hive_sync.support_timestamp](#hoodiedatasourcehive_syncsupport_timestamp) | false (Optional) | ‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. Disabled by default for backward compatibility.

`Config Param: HIVE_SUPPORT_TIMESTAMP_TYPE` | +| [hoodie.datasource.hive_sync.sync_as_datasource](#hoodiedatasourcehive_syncsync_as_datasource) | true (Optional) |

`Config Param: HIVE_SYNC_AS_DATA_SOURCE_TABLE` | +| [hoodie.datasource.hive_sync.sync_comment](#hoodiedatasourcehive_syncsync_comment) | false (Optional) | Whether to sync the table column comments while syncing the table.

`Config Param: HIVE_SYNC_COMMENT` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to.

`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.hive_sync.table.strategy](#hoodiedatasourcehive_synctablestrategy) | ALL (Optional) | Hive table synchronization strategy. Available option: RO, RT, ALL.

`Config Param: HIVE_SYNC_TABLE_STRATEGY`
`Since Version: 0.13.0` | +| [hoodie.datasource.hive_sync.use_jdbc](#hoodiedatasourcehive_syncuse_jdbc) | true (Optional) | Use JDBC when hive synchronization is enabled

`Config Param: HIVE_USE_JDBC` | +| [hoodie.datasource.hive_sync.use_pre_apache_input_format](#hoodiedatasourcehive_syncuse_pre_apache_input_format) | false (Optional) | Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. Use this when you are in the process of migrating from com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format

`Config Param: HIVE_USE_PRE_APACHE_INPUT_FORMAT` | +| [hoodie.datasource.hive_sync.username](#hoodiedatasourcehive_syncusername) | hive (Optional) | hive user name to use

`Config Param: HIVE_USER` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync

`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change.

`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.

`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.

`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores

`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | Sync meta info to the origin table if enabled<br>

`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore.

`Config Param: META_SYNC_SPARK_VERSION` | --- @@ -1247,54 +1247,54 @@ Global replication configurations used by the Hudi to sync metadata to Hive Meta [**Basic Configs**](#Global-Hive-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------- | --------------------------------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql. | | -| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore. | | -| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url | | -| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url | | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | --------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.mode](#hoodiedatasourcehive_syncmode) | N/A **(Required)** | Mode to choose for Hive ops. Valid values are hms, jdbc and hiveql.
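
As a usage sketch for the Hive sync configs above, here is a minimal Spark (Java) write that registers the table in a Hive Metastore over HMS. The paths, field names, and metastore URI are assumptions, and the record key, precombine, and partition path options come from the general write configs rather than from these sync tables.

```java
import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SaveMode;
import org.apache.spark.sql.SparkSession;

// Hypothetical end-to-end sketch: write a Hudi table and sync it to a Hive Metastore.
public class HiveSyncWriteExample {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .appName("hudi-hive-sync-sketch")
        .master("local[2]")
        .getOrCreate();

    // Assumed input data; replace with a real source.
    Dataset<Row> df = spark.read().parquet("/tmp/source_parquet");

    df.write()
        .format("hudi")
        // General write configs (assumed; not part of the Hive sync tables above).
        .option("hoodie.table.name", "trips")
        .option("hoodie.datasource.write.recordkey.field", "uuid")
        .option("hoodie.datasource.write.precombine.field", "ts")
        .option("hoodie.datasource.write.partitionpath.field", "dt")
        // Hive sync configs from the tables above.
        .option("hoodie.datasource.hive_sync.enable", "true")
        .option("hoodie.datasource.hive_sync.mode", "hms")
        .option("hoodie.datasource.hive_sync.metastore.uris", "thrift://localhost:9083")
        .option("hoodie.datasource.hive_sync.database", "default")
        .option("hoodie.datasource.hive_sync.table", "trips")
        .option("hoodie.datasource.hive_sync.partition_fields", "dt")
        .mode(SaveMode.Append)
        .save("/tmp/hudi/trips");

    spark.stop();
  }
}
```

The same hive_sync keys apply to the Global Hive Sync group that follows, which differs only in the added global replication settings.
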

`Config Param: HIVE_SYNC_MODE` | +| [hoodie.datasource.hive_sync.enable](#hoodiedatasourcehive_syncenable) | false (Optional) | When set to true, register/sync the table to Apache Hive metastore.

`Config Param: HIVE_SYNC_ENABLED` | +| [hoodie.datasource.hive_sync.jdbcurl](#hoodiedatasourcehive_syncjdbcurl) | jdbc:hive2://localhost:10000 (Optional) | Hive metastore url

`Config Param: HIVE_URL` | +| [hoodie.datasource.hive_sync.metastore.uris](#hoodiedatasourcehive_syncmetastoreuris) | thrift://localhost:9083 (Optional) | Hive metastore url

`Config Param: METASTORE_URIS` | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.

`Config Param: META_SYNC_ENABLED` | [**Advanced Configs**](#Global-Hive-Sync-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.hive_sync.serde_properties](#hoodiedatasourcehive_syncserde_properties) | N/A **(Required)** | Serde properties to hive table. | | -| [hoodie.datasource.hive_sync.table_properties](#hoodiedatasourcehive_synctable_properties) | N/A **(Required)** | Additional properties to store with table. | | -| [hoodie.meta_sync.global.replicate.timestamp](#hoodiemeta_syncglobalreplicatetimestamp) | N/A **(Required)** | | | -| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd | | -| [hoodie.datasource.hive_sync.auto_create_database](#hoodiedatasourcehive_syncauto_create_database) | true (Optional) | Auto create hive database if does not exists | | -| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync. | | -| [hoodie.datasource.hive_sync.batch_num](#hoodiedatasourcehive_syncbatch_num) | 1000 (Optional) | The number of partitions one batch when synchronous partitions to hive. | | -| [hoodie.datasource.hive_sync.bucket_sync](#hoodiedatasourcehive_syncbucket_sync) | false (Optional) | Whether sync hive metastore bucket specification when using bucket index.The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS' | | -| [hoodie.datasource.hive_sync.bucket_sync_spec](#hoodiedatasourcehive_syncbucket_sync_spec) | (Optional) | The hive metastore bucket specification when using bucket index.The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS' | | -| [hoodie.datasource.hive_sync.create_managed_table](#hoodiedatasourcehive_synccreate_managed_table) | false (Optional) | Whether to sync the table as managed table. | | -| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to. | | -| [hoodie.datasource.hive_sync.filter_pushdown_enabled](#hoodiedatasourcehive_syncfilter_pushdown_enabled) | false (Optional) | Whether to enable push down partitions by filter | | -| [hoodie.datasource.hive_sync.filter_pushdown_max_size](#hoodiedatasourcehive_syncfilter_pushdown_max_size) | 1000 (Optional) | Max size limit to push down partition filters, if the estimate push down filters exceed this size, will directly try to fetch all partitions | | -| [hoodie.datasource.hive_sync.ignore_exceptions](#hoodiedatasourcehive_syncignore_exceptions) | false (Optional) | Ignore exceptions when syncing with Hive. | | -| [hoodie.datasource.hive_sync.omit_metadata_fields](#hoodiedatasourcehive_syncomit_metadata_fields) | false (Optional) | Whether to omit the hoodie metadata fields in the target table. 
| 0.13.0 | -| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'. | | -| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns. | | -| [hoodie.datasource.hive_sync.password](#hoodiedatasourcehive_syncpassword) | hive (Optional) | hive password to use | | -| [hoodie.datasource.hive_sync.schema_string_length_thresh](#hoodiedatasourcehive_syncschema_string_length_thresh) | 4000 (Optional) | | | -| [hoodie.datasource.hive_sync.skip_ro_suffix](#hoodiedatasourcehive_syncskip_ro_suffix) | false (Optional) | Skip the _ro suffix for Read optimized table, when registering | | -| [hoodie.datasource.hive_sync.support_timestamp](#hoodiedatasourcehive_syncsupport_timestamp) | false (Optional) | ‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. Disabled by default for backward compatibility. | | -| [hoodie.datasource.hive_sync.sync_as_datasource](#hoodiedatasourcehive_syncsync_as_datasource) | true (Optional) | | | -| [hoodie.datasource.hive_sync.sync_comment](#hoodiedatasourcehive_syncsync_comment) | false (Optional) | Whether to sync the table column comments while syncing the table. | | -| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to. | | -| [hoodie.datasource.hive_sync.table.strategy](#hoodiedatasourcehive_synctablestrategy) | ALL (Optional) | Hive table synchronization strategy. Available option: RO, RT, ALL. | 0.13.0 | -| [hoodie.datasource.hive_sync.use_jdbc](#hoodiedatasourcehive_syncuse_jdbc) | true (Optional) | Use JDBC when hive synchronization is enabled | | -| [hoodie.datasource.hive_sync.use_pre_apache_input_format](#hoodiedatasourcehive_syncuse_pre_apache_input_format) | false (Optional) | Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. Use this when you are in the process of migrating from com.uber.hoodie to org.apache.hudi. Stop using this after you migrated the table definition to org.apache.hudi input format | | -| [hoodie.datasource.hive_sync.username](#hoodiedatasourcehive_syncusername) | hive (Optional) | hive user name to use | | -| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync | | -| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change. | | -| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false. | | -| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost. 
| 0.14.0 | -| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores | | -| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | sync meta info to origin table if enable | 0.14.0 | -| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore. | | +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.hive_sync.serde_properties](#hoodiedatasourcehive_syncserde_properties) | N/A **(Required)** | Serde properties to hive table.

`Config Param: HIVE_TABLE_SERDE_PROPERTIES` | +| [hoodie.datasource.hive_sync.table_properties](#hoodiedatasourcehive_synctable_properties) | N/A **(Required)** | Additional properties to store with table.

`Config Param: HIVE_TABLE_PROPERTIES` | +| [hoodie.meta_sync.global.replicate.timestamp](#hoodiemeta_syncglobalreplicatetimestamp) | N/A **(Required)** |

`Config Param: META_SYNC_GLOBAL_REPLICATE_TIMESTAMP` | +| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd

`Config Param: META_SYNC_ASSUME_DATE_PARTITION` | +| [hoodie.datasource.hive_sync.auto_create_database](#hoodiedatasourcehive_syncauto_create_database) | true (Optional) | Auto create hive database if it does not exist

`Config Param: HIVE_AUTO_CREATE_DATABASE` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync.

`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.batch_num](#hoodiedatasourcehive_syncbatch_num) | 1000 (Optional) | The number of partitions per batch when synchronizing partitions to hive.

`Config Param: HIVE_BATCH_SYNC_PARTITION_NUM` | +| [hoodie.datasource.hive_sync.bucket_sync](#hoodiedatasourcehive_syncbucket_sync) | false (Optional) | Whether to sync the hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'

`Config Param: HIVE_SYNC_BUCKET_SYNC` | +| [hoodie.datasource.hive_sync.bucket_sync_spec](#hoodiedatasourcehive_syncbucket_sync_spec) | (Optional) | The hive metastore bucket specification when using bucket index. The specification is 'CLUSTERED BY (trace_id) SORTED BY (trace_id ASC) INTO 65536 BUCKETS'

`Config Param: HIVE_SYNC_BUCKET_SYNC_SPEC` | +| [hoodie.datasource.hive_sync.create_managed_table](#hoodiedatasourcehive_synccreate_managed_table) | false (Optional) | Whether to sync the table as managed table.

`Config Param: HIVE_CREATE_MANAGED_TABLE` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to.

`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.filter_pushdown_enabled](#hoodiedatasourcehive_syncfilter_pushdown_enabled) | false (Optional) | Whether to enable push down partitions by filter

`Config Param: HIVE_SYNC_FILTER_PUSHDOWN_ENABLED` | +| [hoodie.datasource.hive_sync.filter_pushdown_max_size](#hoodiedatasourcehive_syncfilter_pushdown_max_size) | 1000 (Optional) | Max size limit to push down partition filters; if the estimated push down filters exceed this size, it will directly try to fetch all partitions

`Config Param: HIVE_SYNC_FILTER_PUSHDOWN_MAX_SIZE` | +| [hoodie.datasource.hive_sync.ignore_exceptions](#hoodiedatasourcehive_syncignore_exceptions) | false (Optional) | Ignore exceptions when syncing with Hive.

`Config Param: HIVE_IGNORE_EXCEPTIONS` | +| [hoodie.datasource.hive_sync.omit_metadata_fields](#hoodiedatasourcehive_syncomit_metadata_fields) | false (Optional) | Whether to omit the hoodie metadata fields in the target table.

`Config Param: HIVE_SYNC_OMIT_METADATA_FIELDS`
`Since Version: 0.13.0` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.

`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns.

`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.password](#hoodiedatasourcehive_syncpassword) | hive (Optional) | hive password to use

`Config Param: HIVE_PASS` | +| [hoodie.datasource.hive_sync.schema_string_length_thresh](#hoodiedatasourcehive_syncschema_string_length_thresh) | 4000 (Optional) |

`Config Param: HIVE_SYNC_SCHEMA_STRING_LENGTH_THRESHOLD` | +| [hoodie.datasource.hive_sync.skip_ro_suffix](#hoodiedatasourcehive_syncskip_ro_suffix) | false (Optional) | Skip the _ro suffix for the read optimized table when registering

`Config Param: HIVE_SKIP_RO_SUFFIX_FOR_READ_OPTIMIZED_TABLE` | +| [hoodie.datasource.hive_sync.support_timestamp](#hoodiedatasourcehive_syncsupport_timestamp) | false (Optional) | ‘INT64’ with original type TIMESTAMP_MICROS is converted to hive ‘timestamp’ type. Disabled by default for backward compatibility.

`Config Param: HIVE_SUPPORT_TIMESTAMP_TYPE` | +| [hoodie.datasource.hive_sync.sync_as_datasource](#hoodiedatasourcehive_syncsync_as_datasource) | true (Optional) |

`Config Param: HIVE_SYNC_AS_DATA_SOURCE_TABLE` | +| [hoodie.datasource.hive_sync.sync_comment](#hoodiedatasourcehive_syncsync_comment) | false (Optional) | Whether to sync the table column comments while syncing the table.

`Config Param: HIVE_SYNC_COMMENT` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to.

`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.hive_sync.table.strategy](#hoodiedatasourcehive_synctablestrategy) | ALL (Optional) | Hive table synchronization strategy. Available option: RO, RT, ALL.

`Config Param: HIVE_SYNC_TABLE_STRATEGY`
`Since Version: 0.13.0` | +| [hoodie.datasource.hive_sync.use_jdbc](#hoodiedatasourcehive_syncuse_jdbc) | true (Optional) | Use JDBC when hive synchronization is enabled

`Config Param: HIVE_USE_JDBC` | +| [hoodie.datasource.hive_sync.use_pre_apache_input_format](#hoodiedatasourcehive_syncuse_pre_apache_input_format) | false (Optional) | Flag to choose InputFormat under com.uber.hoodie package instead of org.apache.hudi package. Use this when you are in the process of migrating from com.uber.hoodie to org.apache.hudi. Stop using this after you have migrated the table definition to the org.apache.hudi input format

`Config Param: HIVE_USE_PRE_APACHE_INPUT_FORMAT` | +| [hoodie.datasource.hive_sync.username](#hoodiedatasourcehive_syncusername) | hive (Optional) | hive user name to use

`Config Param: HIVE_USER` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync

`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change.

`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.

`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.

`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores

`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | Sync meta info to the origin table if enabled

`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore.

`Config Param: META_SYNC_SPARK_VERSION` | --- @@ -1306,34 +1306,34 @@ Configurations used by the Hudi to sync metadata to DataHub. [**Basic Configs**](#DataHub-Sync-Configs-basic-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------- | ---------------- | -------------------------------------------------------------------------- | ------------- | -| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog. | | +| Config Name | Default | Description | +| --------------------------------------------------------------------- | ---------------- | ----------------------------------------------------------------------------------------------------------------------- | +| [hoodie.datasource.meta.sync.enable](#hoodiedatasourcemetasyncenable) | false (Optional) | Enable Syncing the Hudi Table with an external meta store or data catalog.

`Config Param: META_SYNC_ENABLED` | [**Advanced Configs**](#DataHub-Sync-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.meta.sync.datahub.emitter.server](#hoodiemetasyncdatahubemitterserver) | N/A **(Required)** | Server URL of the DataHub instance. | | -| [hoodie.meta.sync.datahub.emitter.supplier.class](#hoodiemetasyncdatahubemittersupplierclass) | N/A **(Required)** | Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs. | | -| [hoodie.meta.sync.datahub.emitter.token](#hoodiemetasyncdatahubemittertoken) | N/A **(Required)** | Auth token to connect to the DataHub instance. | | -| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd | | -| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync. | | -| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to. | | -| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'. | | -| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns. | | -| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to. | | -| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync | | -| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change. | | -| [hoodie.meta.sync.datahub.dataplatform.name](#hoodiemetasyncdatahubdataplatformname) | hudi (Optional) | String used to represent Hudi when creating its corresponding DataPlatform entity within Datahub | | -| [hoodie.meta.sync.datahub.dataset.env](#hoodiemetasyncdatahubdatasetenv) | DEV (Optional) | Environment to use when pushing entities to Datahub | | -| [hoodie.meta.sync.datahub.dataset.identifier.class](#hoodiemetasyncdatahubdatasetidentifierclass) | org.apache.hudi.sync.datahub.config.HoodieDataHubDatasetIdentifier (Optional) | Pluggable class to help provide info to identify a DataHub Dataset. | | -| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false. 
| | -| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost. | 0.14.0 | -| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores | | -| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | sync meta info to origin table if enable | 0.14.0 | -| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore. | | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------------------ | ----------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.meta.sync.datahub.emitter.server](#hoodiemetasyncdatahubemitterserver) | N/A **(Required)** | Server URL of the DataHub instance.

`Config Param: META_SYNC_DATAHUB_EMITTER_SERVER` | +| [hoodie.meta.sync.datahub.emitter.supplier.class](#hoodiemetasyncdatahubemittersupplierclass) | N/A **(Required)** | Pluggable class to supply a DataHub REST emitter to connect to the DataHub instance. This overwrites other emitter configs.

`Config Param: META_SYNC_DATAHUB_EMITTER_SUPPLIER_CLASS` | +| [hoodie.meta.sync.datahub.emitter.token](#hoodiemetasyncdatahubemittertoken) | N/A **(Required)** | Auth token to connect to the DataHub instance.

`Config Param: META_SYNC_DATAHUB_EMITTER_TOKEN` | +| [hoodie.datasource.hive_sync.assume_date_partitioning](#hoodiedatasourcehive_syncassume_date_partitioning) | false (Optional) | Assume partitioning is yyyy/MM/dd

`Config Param: META_SYNC_ASSUME_DATE_PARTITION` | +| [hoodie.datasource.hive_sync.base_file_format](#hoodiedatasourcehive_syncbase_file_format) | PARQUET (Optional) | Base file format for the sync.

`Config Param: META_SYNC_BASE_FILE_FORMAT` | +| [hoodie.datasource.hive_sync.database](#hoodiedatasourcehive_syncdatabase) | default (Optional) | The name of the destination database that we should sync the hudi table to.

`Config Param: META_SYNC_DATABASE_NAME` | +| [hoodie.datasource.hive_sync.partition_extractor_class](#hoodiedatasourcehive_syncpartition_extractor_class) | org.apache.hudi.hive.MultiPartKeysValueExtractor (Optional) | Class which implements PartitionValueExtractor to extract the partition values, default 'org.apache.hudi.hive.MultiPartKeysValueExtractor'.

`Config Param: META_SYNC_PARTITION_EXTRACTOR_CLASS` | +| [hoodie.datasource.hive_sync.partition_fields](#hoodiedatasourcehive_syncpartition_fields) | (Optional) | Field in the table to use for determining hive partition columns.

`Config Param: META_SYNC_PARTITION_FIELDS` | +| [hoodie.datasource.hive_sync.table](#hoodiedatasourcehive_synctable) | unknown (Optional) | The name of the destination table that we should sync the hudi table to.

`Config Param: META_SYNC_TABLE_NAME` | +| [hoodie.datasource.meta.sync.base.path](#hoodiedatasourcemetasyncbasepath) | (Optional) | Base path of the hoodie table to sync

`Config Param: META_SYNC_BASE_PATH` | +| [hoodie.datasource.meta_sync.condition.sync](#hoodiedatasourcemeta_syncconditionsync) | false (Optional) | If true, only sync on conditions like schema change or partition change.

`Config Param: META_SYNC_CONDITIONAL_SYNC` | +| [hoodie.meta.sync.datahub.dataplatform.name](#hoodiemetasyncdatahubdataplatformname) | hudi (Optional) | String used to represent Hudi when creating its corresponding DataPlatform entity within Datahub

`Config Param: META_SYNC_DATAHUB_DATAPLATFORM_NAME` | +| [hoodie.meta.sync.datahub.dataset.env](#hoodiemetasyncdatahubdatasetenv) | DEV (Optional) | Environment to use when pushing entities to Datahub

`Config Param: META_SYNC_DATAHUB_DATASET_ENV` | +| [hoodie.meta.sync.datahub.dataset.identifier.class](#hoodiemetasyncdatahubdatasetidentifierclass) | org.apache.hudi.sync.datahub.config.HoodieDataHubDatasetIdentifier (Optional) | Pluggable class to help provide info to identify a DataHub Dataset.

`Config Param: META_SYNC_DATAHUB_DATASET_IDENTIFIER_CLASS` | +| [hoodie.meta.sync.decode_partition](#hoodiemetasyncdecode_partition) | false (Optional) | If true, meta sync will url-decode the partition path, as it is deemed as url-encoded. Default to false.

`Config Param: META_SYNC_DECODE_PARTITION` | +| [hoodie.meta.sync.incremental](#hoodiemetasyncincremental) | true (Optional) | Whether to incrementally sync the partitions to the metastore, i.e., only added, changed, and deleted partitions based on the commit metadata. If set to `false`, the meta sync executes a full partition sync operation when partitions are lost.

`Config Param: META_SYNC_INCREMENTAL`
`Since Version: 0.14.0` | +| [hoodie.meta.sync.metadata_file_listing](#hoodiemetasyncmetadata_file_listing) | false (Optional) | Enable the internal metadata table for file listing for syncing with metastores

`Config Param: META_SYNC_USE_FILE_LISTING_FROM_METADATA` | +| [hoodie.meta.sync.sync_snapshot_with_table_name](#hoodiemetasyncsync_snapshot_with_table_name) | false (Optional) | Sync meta info to the origin table if enabled

`Config Param: META_SYNC_SNAPSHOT_WITH_TABLE_NAME`
`Since Version: 0.14.0` | +| [hoodie.meta_sync.spark.version](#hoodiemeta_syncsparkversion) | (Optional) | The spark version used when syncing with a metastore.

`Config Param: META_SYNC_SPARK_VERSION` | --- ## Metrics Configs {#METRICS} @@ -1348,12 +1348,12 @@ Enables reporting on Hudi metrics using Amazon CloudWatch. Hudi publishes metri [**Advanced Configs**](#Metrics-Configurations-for-Amazon-CloudWatch-advanced-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------------------- | --------------- | -------------------------------- | ------------- | -| [hoodie.metrics.cloudwatch.maxDatumsPerRequest](#hoodiemetricscloudwatchmaxDatumsPerRequest) | 20 (Optional) | Max number of Datums per request | 0.10.0 | -| [hoodie.metrics.cloudwatch.metric.prefix](#hoodiemetricscloudwatchmetricprefix) | (Optional) | Metric prefix of reporter | 0.10.0 | -| [hoodie.metrics.cloudwatch.namespace](#hoodiemetricscloudwatchnamespace) | Hudi (Optional) | Namespace of reporter | 0.10.0 | -| [hoodie.metrics.cloudwatch.report.period.seconds](#hoodiemetricscloudwatchreportperiodseconds) | 60 (Optional) | Reporting interval in seconds | 0.10.0 | +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------- | --------------- | --------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.cloudwatch.maxDatumsPerRequest](#hoodiemetricscloudwatchmaxDatumsPerRequest) | 20 (Optional) | Max number of Datums per request

`Config Param: MAX_DATUMS_PER_REQUEST`
`Since Version: 0.10.0` | +| [hoodie.metrics.cloudwatch.metric.prefix](#hoodiemetricscloudwatchmetricprefix) | (Optional) | Metric prefix of reporter

`Config Param: METRIC_PREFIX`
`Since Version: 0.10.0` | +| [hoodie.metrics.cloudwatch.namespace](#hoodiemetricscloudwatchnamespace) | Hudi (Optional) | Namespace of reporter

`Config Param: METRIC_NAMESPACE`
`Since Version: 0.10.0` | +| [hoodie.metrics.cloudwatch.report.period.seconds](#hoodiemetricscloudwatchreportperiodseconds) | 60 (Optional) | Reporting interval in seconds

`Config Param: REPORT_PERIOD_SECONDS`
`Since Version: 0.10.0` | --- @@ -1365,21 +1365,22 @@ Enables reporting on Hudi metrics. Hudi publishes metrics on every commit, clean [**Basic Configs**](#Metrics-Configurations-basic-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------- | ------------------- | ---------------------------------------------- | ------------- | -| [hoodie.metrics.on](#hoodiemetricson) | false (Optional) | Turn on/off metrics reporting. off by default. | 0.5.0 | -| [hoodie.metrics.reporter.type](#hoodiemetricsreportertype) | GRAPHITE (Optional) | Type of metrics reporter. | 0.5.0 | +| Config Name | Default | Description | +| ----------------------------------------------------------------------------- | ------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.on](#hoodiemetricson) | false (Optional) | Turn on/off metrics reporting. off by default.

`Config Param: TURN_METRICS_ON`
`Since Version: 0.5.0` | +| [hoodie.metrics.reporter.type](#hoodiemetricsreportertype) | GRAPHITE (Optional) | Type of metrics reporter.

`Config Param: METRICS_REPORTER_TYPE_VALUE`
`Since Version: 0.5.0` | +| [hoodie.metricscompaction.log.blocks.on](#hoodiemetricscompactionlogblockson) | false (Optional) | Turn on/off metrics reporting for log blocks with compaction commit. off by default.

`Config Param: TURN_METRICS_COMPACTION_LOG_BLOCKS_ON`
`Since Version: 0.14.0` | [**Advanced Configs**](#Metrics-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------- | ------------------ | --------------------------------------------------------------------------- | ------------- | -| [hoodie.metrics.executor.enable](#hoodiemetricsexecutorenable) | N/A **(Required)** | | 0.7.0 | -| [hoodie.metrics.configs.properties](#hoodiemetricsconfigsproperties) | (Optional) | Comma separated list of config file paths for metric exporter configs | 0.14.0 | -| [hoodie.metrics.lock.enable](#hoodiemetricslockenable) | false (Optional) | Enable metrics for locking infra. Useful when operating in multiwriter mode | 0.13.0 | -| [hoodie.metrics.reporter.class](#hoodiemetricsreporterclass) | (Optional) | | 0.6.0 | -| [hoodie.metrics.reporter.metricsname.prefix](#hoodiemetricsreportermetricsnameprefix) | (Optional) | The prefix given to the metrics names. | 0.11.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.executor.enable](#hoodiemetricsexecutorenable) | N/A **(Required)** |

`Config Param: EXECUTOR_METRICS_ENABLE`
`Since Version: 0.7.0` | +| [hoodie.metrics.configs.properties](#hoodiemetricsconfigsproperties) | (Optional) | Comma separated list of config file paths for metric exporter configs

`Config Param: METRICS_REPORTER_FILE_BASED_CONFIGS_PATH`
`Since Version: 0.14.0` | +| [hoodie.metrics.lock.enable](#hoodiemetricslockenable) | false (Optional) | Enable metrics for locking infra. Useful when operating in multiwriter mode

`Config Param: LOCK_METRICS_ENABLE`
`Since Version: 0.13.0` | +| [hoodie.metrics.reporter.class](#hoodiemetricsreporterclass) | (Optional) |

`Config Param: METRICS_REPORTER_CLASS_NAME`
`Since Version: 0.6.0` | +| [hoodie.metrics.reporter.metricsname.prefix](#hoodiemetricsreportermetricsnameprefix) | (Optional) | The prefix given to the metrics names.

`Config Param: METRICS_REPORTER_PREFIX`
`Since Version: 0.11.0` | --- @@ -1391,17 +1392,17 @@ Enables reporting on Hudi metrics using the Datadog reporter type. Hudi publishe [**Advanced Configs**](#Metrics-Configurations-for-Datadog-reporter-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------- | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.metrics.datadog.api.key](#hoodiemetricsdatadogapikey) | N/A **(Required)** | Datadog API key | 0.6.0 | -| [hoodie.metrics.datadog.api.key.supplier](#hoodiemetricsdatadogapikeysupplier) | N/A **(Required)** | Datadog API key supplier to supply the API key at runtime. This will take effect if hoodie.metrics.datadog.api.key is not set. | 0.6.0 | -| [hoodie.metrics.datadog.api.site](#hoodiemetricsdatadogapisite) | N/A **(Required)** | Datadog API site: EU or US | 0.6.0 | -| [hoodie.metrics.datadog.metric.host](#hoodiemetricsdatadogmetrichost) | N/A **(Required)** | Datadog metric host to be sent along with metrics data. | 0.6.0 | -| [hoodie.metrics.datadog.metric.prefix](#hoodiemetricsdatadogmetricprefix) | N/A **(Required)** | Datadog metric prefix to be prepended to each metric name with a dot as delimiter. For example, if it is set to foo, foo. will be prepended. | 0.6.0 | -| [hoodie.metrics.datadog.metric.tags](#hoodiemetricsdatadogmetrictags) | N/A **(Required)** | Datadog metric tags (comma-delimited) to be sent along with metrics data. | 0.6.0 | -| [hoodie.metrics.datadog.api.key.skip.validation](#hoodiemetricsdatadogapikeyskipvalidation) | false (Optional) | Before sending metrics via Datadog API, whether to skip validating Datadog API key or not. Default to false. | 0.6.0 | -| [hoodie.metrics.datadog.api.timeout.seconds](#hoodiemetricsdatadogapitimeoutseconds) | 3 (Optional) | Datadog API timeout in seconds. Default to 3. | 0.6.0 | -| [hoodie.metrics.datadog.report.period.seconds](#hoodiemetricsdatadogreportperiodseconds) | 30 (Optional) | Datadog reporting period in seconds. Default to 30. | 0.6.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.datadog.api.key](#hoodiemetricsdatadogapikey) | N/A **(Required)** | Datadog API key

`Config Param: API_KEY`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.api.key.supplier](#hoodiemetricsdatadogapikeysupplier) | N/A **(Required)** | Datadog API key supplier to supply the API key at runtime. This will take effect if hoodie.metrics.datadog.api.key is not set.

`Config Param: API_KEY_SUPPLIER`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.api.site](#hoodiemetricsdatadogapisite) | N/A **(Required)** | Datadog API site: EU or US

`Config Param: API_SITE_VALUE`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.metric.host](#hoodiemetricsdatadogmetrichost) | N/A **(Required)** | Datadog metric host to be sent along with metrics data.

`Config Param: METRIC_HOST_NAME`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.metric.prefix](#hoodiemetricsdatadogmetricprefix) | N/A **(Required)** | Datadog metric prefix to be prepended to each metric name with a dot as delimiter. For example, if it is set to foo, foo. will be prepended.

`Config Param: METRIC_PREFIX_VALUE`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.metric.tags](#hoodiemetricsdatadogmetrictags) | N/A **(Required)** | Datadog metric tags (comma-delimited) to be sent along with metrics data.

`Config Param: METRIC_TAG_VALUES`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.api.key.skip.validation](#hoodiemetricsdatadogapikeyskipvalidation) | false (Optional) | Before sending metrics via Datadog API, whether to skip validating Datadog API key or not. Default to false.

`Config Param: API_KEY_SKIP_VALIDATION`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.api.timeout.seconds](#hoodiemetricsdatadogapitimeoutseconds) | 3 (Optional) | Datadog API timeout in seconds. Default to 3.

`Config Param: API_TIMEOUT_IN_SECONDS`
`Since Version: 0.6.0` | +| [hoodie.metrics.datadog.report.period.seconds](#hoodiemetricsdatadogreportperiodseconds) | 30 (Optional) | Datadog reporting period in seconds. Default to 30.

`Config Param: REPORT_PERIOD_IN_SECONDS`
`Since Version: 0.6.0` | --- @@ -1413,12 +1414,12 @@ Enables reporting on Hudi metrics using Graphite. Hudi publishes metrics on eve [**Advanced Configs**](#Metrics-Configurations-for-Graphite-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------ | -------------------- | ----------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.metrics.graphite.metric.prefix](#hoodiemetricsgraphitemetricprefix) | N/A **(Required)** | Standard prefix applied to all metrics. This helps to add datacenter, environment information for e.g | 0.5.1 | -| [hoodie.metrics.graphite.host](#hoodiemetricsgraphitehost) | localhost (Optional) | Graphite host to connect to. | 0.5.0 | -| [hoodie.metrics.graphite.port](#hoodiemetricsgraphiteport) | 4756 (Optional) | Graphite port to connect to. | 0.5.0 | -| [hoodie.metrics.graphite.report.period.seconds](#hoodiemetricsgraphitereportperiodseconds) | 30 (Optional) | Graphite reporting period in seconds. Default to 30. | 0.10.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------ | -------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.graphite.metric.prefix](#hoodiemetricsgraphitemetricprefix) | N/A **(Required)** | Standard prefix applied to all metrics. This helps to add datacenter, environment information for e.g

`Config Param: GRAPHITE_METRIC_PREFIX_VALUE`
`Since Version: 0.5.1` | +| [hoodie.metrics.graphite.host](#hoodiemetricsgraphitehost) | localhost (Optional) | Graphite host to connect to.

`Config Param: GRAPHITE_SERVER_HOST_NAME`
`Since Version: 0.5.0` | +| [hoodie.metrics.graphite.port](#hoodiemetricsgraphiteport) | 4756 (Optional) | Graphite port to connect to.

`Config Param: GRAPHITE_SERVER_PORT_NUM`
`Since Version: 0.5.0` | +| [hoodie.metrics.graphite.report.period.seconds](#hoodiemetricsgraphitereportperiodseconds) | 30 (Optional) | Graphite reporting period in seconds. Default to 30.

`Config Param: GRAPHITE_REPORT_PERIOD_IN_SECONDS`
`Since Version: 0.10.0` | --- @@ -1430,10 +1431,10 @@ Enables reporting on Hudi metrics using Jmx. Hudi publishes metrics on every co [**Advanced Configs**](#Metrics-Configurations-for-Jmx-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------ | -------------------- | ---------------------- | ------------- | -| [hoodie.metrics.jmx.host](#hoodiemetricsjmxhost) | localhost (Optional) | Jmx host to connect to | 0.5.1 | -| [hoodie.metrics.jmx.port](#hoodiemetricsjmxport) | 9889 (Optional) | Jmx port to connect to | 0.5.1 | +| Config Name | Default | Description | +| ------------------------------------------------ | -------------------- | ------------------------------------------------------------------------------------------- | +| [hoodie.metrics.jmx.host](#hoodiemetricsjmxhost) | localhost (Optional) | Jmx host to connect to

`Config Param: JMX_HOST_NAME`
`Since Version: 0.5.1` | +| [hoodie.metrics.jmx.port](#hoodiemetricsjmxport) | 9889 (Optional) | Jmx port to connect to

`Config Param: JMX_PORT_NUM`
`Since Version: 0.5.1` | --- @@ -1445,16 +1446,16 @@ Enables reporting on Hudi metrics using Prometheus. Hudi publishes metrics on e [**Advanced Configs**](#Metrics-Configurations-for-Prometheus-advanced-configs) -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------- | -------------------- | ------------------------------------------------------------------------------------------------------------------ | ------------- | -| [hoodie.metrics.prometheus.port](#hoodiemetricsprometheusport) | 9090 (Optional) | Port for prometheus server. | 0.6.0 | -| [hoodie.metrics.pushgateway.delete.on.shutdown](#hoodiemetricspushgatewaydeleteonshutdown) | true (Optional) | Delete the pushgateway info or not when job shutdown, true by default. | 0.6.0 | -| [hoodie.metrics.pushgateway.host](#hoodiemetricspushgatewayhost) | localhost (Optional) | Hostname of the prometheus push gateway. | 0.6.0 | -| [hoodie.metrics.pushgateway.job.name](#hoodiemetricspushgatewayjobname) | (Optional) | Name of the push gateway job. | 0.6.0 | -| [hoodie.metrics.pushgateway.port](#hoodiemetricspushgatewayport) | 9091 (Optional) | Port for the push gateway. | 0.6.0 | -| [hoodie.metrics.pushgateway.random.job.name.suffix](#hoodiemetricspushgatewayrandomjobnamesuffix) | true (Optional) | Whether the pushgateway name need a random suffix , default true. | 0.6.0 | -| [hoodie.metrics.pushgateway.report.labels](#hoodiemetricspushgatewayreportlabels) | (Optional) | Label for the metrics emitted to the Pushgateway. Labels can be specified with key:value pairs separated by commas | 0.14.0 | -| [hoodie.metrics.pushgateway.report.period.seconds](#hoodiemetricspushgatewayreportperiodseconds) | 30 (Optional) | Reporting interval in seconds. | 0.6.0 | +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------------------- | -------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.metrics.prometheus.port](#hoodiemetricsprometheusport) | 9090 (Optional) | Port for prometheus server.

`Config Param: PROMETHEUS_PORT_NUM`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.delete.on.shutdown](#hoodiemetricspushgatewaydeleteonshutdown) | true (Optional) | Whether to delete the pushgateway info when the job shuts down, true by default.

`Config Param: PUSHGATEWAY_DELETE_ON_SHUTDOWN_ENABLE`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.host](#hoodiemetricspushgatewayhost) | localhost (Optional) | Hostname of the prometheus push gateway.

`Config Param: PUSHGATEWAY_HOST_NAME`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.job.name](#hoodiemetricspushgatewayjobname) | (Optional) | Name of the push gateway job.

`Config Param: PUSHGATEWAY_JOBNAME`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.port](#hoodiemetricspushgatewayport) | 9091 (Optional) | Port for the push gateway.

`Config Param: PUSHGATEWAY_PORT_NUM`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.random.job.name.suffix](#hoodiemetricspushgatewayrandomjobnamesuffix) | true (Optional) | Whether the pushgateway name needs a random suffix, default true.

`Config Param: PUSHGATEWAY_RANDOM_JOBNAME_SUFFIX`
`Since Version: 0.6.0` | +| [hoodie.metrics.pushgateway.report.labels](#hoodiemetricspushgatewayreportlabels) | (Optional) | Label for the metrics emitted to the Pushgateway. Labels can be specified with key:value pairs separated by commas

`Config Param: PUSHGATEWAY_LABELS`
`Since Version: 0.14.0` | +| [hoodie.metrics.pushgateway.report.period.seconds](#hoodiemetricspushgatewayreportperiodseconds) | 30 (Optional) | Reporting interval in seconds.

`Config Param: PUSHGATEWAY_REPORT_PERIOD_IN_SECONDS`
`Since Version: 0.6.0` | --- ## Record Payload Config {#RECORD_PAYLOAD} @@ -1469,11 +1470,11 @@ Payload related configs, that can be leveraged to control merges based on specif [**Advanced Configs**](#Payload-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------- | ---------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.compaction.payload.class](#hoodiecompactionpayloadclass) | org.apache.hudi.common.model.OverwriteWithLatestAvroPayload (Optional) | This needs to be same as class used during insert/upserts. Just like writing, compaction also uses the record payload class to merge records in the log against each other, merge again with the base file and produce the final record to be written after compaction. | | -| [hoodie.payload.event.time.field](#hoodiepayloadeventtimefield) | ts (Optional) | Table column/field name to derive timestamp associated with the records. This canbe useful for e.g, determining the freshness of the table. | | -| [hoodie.payload.ordering.field](#hoodiepayloadorderingfield) | ts (Optional) | Table column/field name to order records that have the same key, before merging and writing to storage. | | +| Config Name | Default | Description | +| ---------------------------------------------------------------- | ---------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.compaction.payload.class](#hoodiecompactionpayloadclass) | org.apache.hudi.common.model.OverwriteWithLatestAvroPayload (Optional) | This needs to be same as class used during insert/upserts. Just like writing, compaction also uses the record payload class to merge records in the log against each other, merge again with the base file and produce the final record to be written after compaction.

`Config Param: PAYLOAD_CLASS_NAME` | +| [hoodie.payload.event.time.field](#hoodiepayloadeventtimefield) | ts (Optional) | Table column/field name to derive timestamp associated with the records. This can be useful, e.g., for determining the freshness of the table.

`Config Param: EVENT_TIME_FIELD` | +| [hoodie.payload.ordering.field](#hoodiepayloadorderingfield) | ts (Optional) | Table column/field name to order records that have the same key, before merging and writing to storage.

`Config Param: ORDERING_FIELD` | --- ## Kafka Connect Configs {#KAFKA_CONNECT} @@ -1488,25 +1489,25 @@ Configurations for Kafka Connect Sink Connector for Hudi. [**Basic Configs**](#Kafka-Sink-Connect-Configurations-basic-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------- | ------------------------- | -------------------------------------------- | ------------- | -| [bootstrap.servers](#bootstrapservers) | localhost:9092 (Optional) | The bootstrap servers for the Kafka Cluster. | | +| Config Name | Default | Description | +| -------------------------------------- | ------------------------- | ----------------------------------------------------------------------------------------------- | +| [bootstrap.servers](#bootstrapservers) | localhost:9092 (Optional) | The bootstrap servers for the Kafka Cluster.

`Config Param: KAFKA_BOOTSTRAP_SERVERS` | [**Advanced Configs**](#Kafka-Sink-Connect-Configurations-advanced-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------------- | --------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hadoop.conf.dir](#hadoopconfdir) | N/A **(Required)** | The Hadoop configuration directory. | | -| [hadoop.home](#hadoophome) | N/A **(Required)** | The Hadoop home directory. | | -| [hoodie.kafka.allow.commit.on.errors](#hoodiekafkaallowcommitonerrors) | true (Optional) | Commit even when some records failed to be written | | -| [hoodie.kafka.commit.interval.secs](#hoodiekafkacommitintervalsecs) | 60 (Optional) | The interval at which Hudi will commit the records written to the files, making them consumable on the read-side. | | -| [hoodie.kafka.compaction.async.enable](#hoodiekafkacompactionasyncenable) | true (Optional) | Controls whether async compaction should be turned on for MOR table writing. | | -| [hoodie.kafka.control.topic](#hoodiekafkacontroltopic) | hudi-control-topic (Optional) | Kafka topic name used by the Hudi Sink Connector for sending and receiving control messages. Not used for data records. | | -| [hoodie.kafka.coordinator.write.timeout.secs](#hoodiekafkacoordinatorwritetimeoutsecs) | 300 (Optional) | The timeout after sending an END_COMMIT until when the coordinator will wait for the write statuses from all the partitionsto ignore the current commit and start a new commit. | | -| [hoodie.meta.sync.classes](#hoodiemetasyncclasses) | org.apache.hudi.hive.HiveSyncTool (Optional) | Meta sync client tool, using comma to separate multi tools | | -| [hoodie.meta.sync.enable](#hoodiemetasyncenable) | false (Optional) | Enable Meta Sync such as Hive | | -| [hoodie.schemaprovider.class](#hoodieschemaproviderclass) | org.apache.hudi.schema.FilebasedSchemaProvider (Optional) | subclass of org.apache.hudi.schema.SchemaProvider to attach schemas to input & target table data, built in options: org.apache.hudi.schema.FilebasedSchemaProvider. | | +| Config Name | Default | Description | +| -------------------------------------------------------------------------------------- | --------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hadoop.conf.dir](#hadoopconfdir) | N/A **(Required)** | The Hadoop configuration directory.

`Config Param: HADOOP_CONF_DIR` | +| [hadoop.home](#hadoophome) | N/A **(Required)** | The Hadoop home directory.

`Config Param: HADOOP_HOME` | +| [hoodie.kafka.allow.commit.on.errors](#hoodiekafkaallowcommitonerrors) | true (Optional) | Commit even when some records failed to be written

`Config Param: ALLOW_COMMIT_ON_ERRORS` | +| [hoodie.kafka.commit.interval.secs](#hoodiekafkacommitintervalsecs) | 60 (Optional) | The interval at which Hudi will commit the records written to the files, making them consumable on the read-side.

`Config Param: COMMIT_INTERVAL_SECS` | +| [hoodie.kafka.compaction.async.enable](#hoodiekafkacompactionasyncenable) | true (Optional) | Controls whether async compaction should be turned on for MOR table writing.

`Config Param: ASYNC_COMPACT_ENABLE` | +| [hoodie.kafka.control.topic](#hoodiekafkacontroltopic) | hudi-control-topic (Optional) | Kafka topic name used by the Hudi Sink Connector for sending and receiving control messages. Not used for data records.

`Config Param: CONTROL_TOPIC_NAME` | +| [hoodie.kafka.coordinator.write.timeout.secs](#hoodiekafkacoordinatorwritetimeoutsecs) | 300 (Optional) | The timeout after sending an END_COMMIT during which the coordinator will wait for the write statuses from all the partitions, before ignoring the current commit and starting a new commit.

`Config Param: COORDINATOR_WRITE_TIMEOUT_SECS` | +| [hoodie.meta.sync.classes](#hoodiemetasyncclasses) | org.apache.hudi.hive.HiveSyncTool (Optional) | Meta sync client tool, using comma to separate multiple tools

`Config Param: META_SYNC_CLASSES` | +| [hoodie.meta.sync.enable](#hoodiemetasyncenable) | false (Optional) | Enable Meta Sync such as Hive

`Config Param: META_SYNC_ENABLE` | +| [hoodie.schemaprovider.class](#hoodieschemaproviderclass) | org.apache.hudi.schema.FilebasedSchemaProvider (Optional) | Subclass of org.apache.hudi.schema.SchemaProvider to attach schemas to input & target table data, built-in options: org.apache.hudi.schema.FilebasedSchemaProvider.

`Config Param: SCHEMA_PROVIDER_CLASS` | --- ## Amazon Web Services Configs {#AWS} @@ -1521,447 +1522,42 @@ Amazon Web Services configurations to access resources like Amazon DynamoDB (for [**Advanced Configs**](#Amazon-Web-Services-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| -------------------------------------------------- | ------------------ | ----------------- | ------------- | -| [hoodie.aws.access.key](#hoodieawsaccesskey) | N/A **(Required)** | AWS access key id | 0.10.0 | -| [hoodie.aws.secret.key](#hoodieawssecretkey) | N/A **(Required)** | AWS secret key | 0.10.0 | -| [hoodie.aws.session.token](#hoodieawssessiontoken) | N/A **(Required)** | AWS session token | 0.10.0 | ---- - -## DeltaStreamer Configs {#DELTA_STREAMER} -These set of configs are used for DeltaStreamer utility which provides the way to ingest from different sources such as DFS or Kafka. - - -### DeltaStreamer Configs {#DeltaStreamer-Configs} -Configurations that control DeltaStreamer. - - - -[**Basic Configs**](#DeltaStreamer-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------- | ------------------ | ----------------- | ------------- | -| [hoodie.deltastreamer.source.kafka.topic](#hoodiedeltastreamersourcekafkatopic) | N/A **(Required)** | Kafka topic name. | | - -[**Advanced Configs**](#DeltaStreamer-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.checkpoint.provider.path](#hoodiedeltastreamercheckpointproviderpath) | N/A **(Required)** | The path for providing the checkpoints. | | -| [hoodie.deltastreamer.ingestion.tablesToBeIngested](#hoodiedeltastreameringestiontablesToBeIngested) | N/A **(Required)** | Comma separated names of tables to be ingested in the format <database>.<table>, for example db1.table1,db1.table2 | | -| [hoodie.deltastreamer.ingestion.targetBasePath](#hoodiedeltastreameringestiontargetBasePath) | N/A **(Required)** | The path to which a particular table is ingested. The config is specific to HoodieMultiTableDeltaStreamer and overrides path determined using option `--base-path-prefix` for a table. This config is ignored for a single table deltastreamer | | -| [hoodie.deltastreamer.multiwriter.source.checkpoint.id](#hoodiedeltastreamermultiwritersourcecheckpointid) | N/A **(Required)** | Unique Id to be used for multi-writer deltastreamer scenario. This is the scenario when multiple deltastreamers are used to write to the same target table. If you are just using a single deltastreamer for a table then you do not need to set this config. | | -| [hoodie.deltastreamer.source.kafka.append.offsets](#hoodiedeltastreamersourcekafkaappendoffsets) | false (Optional) | When enabled, appends kafka offset info like source offset(_hoodie_kafka_source_offset), partition (_hoodie_kafka_source_partition) and timestamp (_hoodie_kafka_source_timestamp) to the records. 
By default its disabled and no kafka offsets are added | | -| [hoodie.deltastreamer.source.sanitize.invalid.char.mask](#hoodiedeltastreamersourcesanitizeinvalidcharmask) | __ (Optional) | Defines the character sequence that replaces invalid characters in schema field names if hoodie.deltastreamer.source.sanitize.invalid.schema.field.names is enabled. | | -| [hoodie.deltastreamer.source.sanitize.invalid.schema.field.names](#hoodiedeltastreamersourcesanitizeinvalidschemafieldnames) | false (Optional) | Sanitizes names of invalid schema fields both in the data read from source and also in the schema Replaces invalid characters with hoodie.deltastreamer.source.sanitize.invalid.char.mask. Invalid characters are by goes by avro naming convention (https://avro.apache.org/docs/current/spec.html#names). | | ---- - - -### DeltaStreamer SQL Transformer Configs {#DeltaStreamer-SQL-Transformer-Configs} -Configurations controlling the behavior of SQL transformer in Deltastreamer. - - - -[**Basic Configs**](#DeltaStreamer-SQL-Transformer-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------- | ------------------ | -------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.transformer.sql](#hoodiedeltastreamertransformersql) | N/A **(Required)** | SQL Query to be executed during write | | -| [hoodie.deltastreamer.transformer.sql.file](#hoodiedeltastreamertransformersqlfile) | N/A **(Required)** | File with a SQL script to be executed during write | | ---- - - -### DeltaStreamer Source Configs {#DELTA_STREAMER_SOURCE} -Configurations controlling the behavior of reading source data. - - -#### Cloud Source Configs {#Cloud-Source-Configs} -Configs that are common during ingestion across different cloud stores - - - -[**Advanced Configs**](#Cloud-Source-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.cloud.data.datasource.options](#hoodiedeltastreamersourceclouddatadatasourceoptions) | N/A **(Required)** | A JSON string passed to the Spark DataFrameReader while loading the dataset. Example: hoodie.deltastreamer.gcp.spark.datasource.options={"header":"true","encoding":"UTF-8"} | | -| [hoodie.deltastreamer.source.cloud.data.ignore.relpath.prefix](#hoodiedeltastreamersourceclouddataignorerelpathprefix) | N/A **(Required)** | Ignore objects in the bucket whose relative path matches this prefix | | -| [hoodie.deltastreamer.source.cloud.data.ignore.relpath.substring](#hoodiedeltastreamersourceclouddataignorerelpathsubstring) | N/A **(Required)** | Ignore objects in the bucket whose relative path contains this substring | | -| [hoodie.deltastreamer.source.cloud.data.select.file.extension](#hoodiedeltastreamersourceclouddataselectfileextension) | N/A **(Required)** | Only match files with this extension. 
By default, this is the same as hoodie.deltastreamer.source.hoodieincr.file.format | | -| [hoodie.deltastreamer.source.cloud.data.select.relpath.prefix](#hoodiedeltastreamersourceclouddataselectrelpathprefix) | N/A **(Required)** | Only selects objects in the bucket whose relative path matches this prefix | | -| [hoodie.deltastreamer.source.cloud.data.check.file.exists](#hoodiedeltastreamersourceclouddatacheckfileexists) | false (Optional) | If true, checks whether file exists before attempting to pull it | | -| [hoodie.deltastreamer.source.cloud.data.datafile.format](#hoodiedeltastreamersourceclouddatadatafileformat) | parquet (Optional) | Format of the data file. By default, this will be the same as hoodie.deltastreamer.source.hoodieincr.file.format | | -| [hoodie.deltastreamer.source.cloud.meta.ack](#hoodiedeltastreamersourcecloudmetaack) | true (Optional) | Whether to acknowledge Metadata messages during Cloud Ingestion or not. This is useful during dev and testing. In Prod this should always be true. In case of Cloud Pubsub, not acknowledging means Pubsub will keep redelivering the same messages. | | -| [hoodie.deltastreamer.source.cloud.meta.batch.size](#hoodiedeltastreamersourcecloudmetabatchsize) | 10 (Optional) | Number of metadata messages to pull at a time | | ---- - - -#### DFS Path Selector Configs {#DFS-Path-Selector-Configs} -Configurations controlling the behavior of path selector for DFS source in Deltastreamer. - - - -[**Basic Configs**](#DFS-Path-Selector-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------- | ------------------ | ------------------------------ | ------------- | -| [hoodie.deltastreamer.source.dfs.root](#hoodiedeltastreamersourcedfsroot) | N/A **(Required)** | Root path of the source on DFS | | - -[**Advanced Configs**](#DFS-Path-Selector-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------- | ------------------ | --------------------- | ------------- | -| [hoodie.deltastreamer.source.input.selector](#hoodiedeltastreamersourceinputselector) | N/A **(Required)** | Source input selector | | ---- - - -#### Date Partition Path Selector Configs {#Date-Partition-Path-Selector-Configs} -Configurations controlling the behavior of date partition path selector for DFS source in Deltastreamer. - - - -[**Advanced Configs**](#Date-Partition-Path-Selector-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------------------------------------ | --------------------- | ---------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.dfs.datepartitioned.selector.currentdate](#hoodiedeltastreamersourcedfsdatepartitionedselectorcurrentdate) | N/A **(Required)** | Current date. | | -| [hoodie.deltastreamer.source.dfs.datepartitioned.date.format](#hoodiedeltastreamersourcedfsdatepartitioneddateformat) | yyyy-MM-dd (Optional) | Date format. | | -| [hoodie.deltastreamer.source.dfs.datepartitioned.selector.depth](#hoodiedeltastreamersourcedfsdatepartitionedselectordepth) | 0 (Optional) | Depth of the files to scan. 0 implies no (date) partition. 
| | -| [hoodie.deltastreamer.source.dfs.datepartitioned.selector.lookback.days](#hoodiedeltastreamersourcedfsdatepartitionedselectorlookbackdays) | 2 (Optional) | The maximum look-back days for scanning. | | -| [hoodie.deltastreamer.source.dfs.datepartitioned.selector.parallelism](#hoodiedeltastreamersourcedfsdatepartitionedselectorparallelism) | 20 (Optional) | Parallelism for listing partitions. | | ---- - - -#### GCS Events Source Configs {#GCS-Events-Source-Configs} -Configurations controlling the behavior of GCS Events Source in Deltastreamer. - - - -[**Advanced Configs**](#GCS-Events-Source-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.gcs.project.id](#hoodiedeltastreamersourcegcsprojectid) | N/A **(Required)** | The GCP Project Id where the Pubsub Subscription to ingest from resides. Needed to connect to the Pubsub subscription | | -| [hoodie.deltastreamer.source.gcs.subscription.id](#hoodiedeltastreamersourcegcssubscriptionid) | N/A **(Required)** | The GCP Pubsub subscription id for the GCS Notifications. Needed to connect to the Pubsub subscription | | ---- - - -#### Hive Incremental Pulling Source Configs {#Hive-Incremental-Pulling-Source-Configs} -Configurations controlling the behavior of incremental pulling from a Hive table as a source in Deltastreamer. - - - -[**Advanced Configs**](#Hive-Incremental-Pulling-Source-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------- | ------------------ | ------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.incrpull.root](#hoodiedeltastreamersourceincrpullroot) | N/A **(Required)** | The root path of Hive incremental pulling source. | | ---- - - -#### Hudi Incremental Source Configs {#Hudi-Incremental-Source-Configs} -Configurations controlling the behavior of incremental pulling from a Hudi table as a source in Deltastreamer. 
- - - -[**Basic Configs**](#Hudi-Incremental-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------------- | ------------------ | ----------------------------------- | ------------- | -| [hoodie.deltastreamer.source.hoodieincr.path](#hoodiedeltastreamersourcehoodieincrpath) | N/A **(Required)** | Base-path for the source Hudi table | | - -[**Advanced Configs**](#Hudi-Incremental-Source-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.hoodieincr.missing.checkpoint.strategy](#hoodiedeltastreamersourcehoodieincrmissingcheckpointstrategy) | N/A **(Required)** | Allows delta-streamer to decide the instant to consume from when checkpoint is not set. Possible values: [READ_LATEST (Read from latest commit in hoodie source table), READ_UPTO_LATEST_COMMIT (Read everything upto latest commit)] | | -| [hoodie.deltastreamer.source.hoodieincr.partition.extractor.class](#hoodiedeltastreamersourcehoodieincrpartitionextractorclass) | N/A **(Required)** | PartitionValueExtractor class to extract partition fields from _hoodie_partition_path | | -| [hoodie.deltastreamer.source.hoodieincr.partition.fields](#hoodiedeltastreamersourcehoodieincrpartitionfields) | N/A **(Required)** | Specifies partition fields that needs to be added to source table after parsing _hoodie_partition_path. | | -| [hoodie.deltastreamer.source.hoodieincr.drop.all.meta.fields.from.source](#hoodiedeltastreamersourcehoodieincrdropallmetafieldsfromsource) | false (Optional) | Drops all meta fields from the source hudi table while ingesting into sink hudi table. | | -| [hoodie.deltastreamer.source.hoodieincr.file.format](#hoodiedeltastreamersourcehoodieincrfileformat) | parquet (Optional) | This config is passed to the reader while loading dataset. Default value is parquet. | | -| [hoodie.deltastreamer.source.hoodieincr.num_instants](#hoodiedeltastreamersourcehoodieincrnum_instants) | 5 (Optional) | Max number of instants whose changes can be incrementally fetched | | -| [hoodie.deltastreamer.source.hoodieincr.read_latest_on_missing_ckpt](#hoodiedeltastreamersourcehoodieincrread_latest_on_missing_ckpt) | false (Optional) | If true, allows delta-streamer to incrementally fetch from latest committed instant when checkpoint is not provided. This config is deprecated. Please refer to hoodie.deltastreamer.source.hoodieincr.missing.checkpoint.strategy | | ---- - - -#### JDBC Source Configs {#JDBC-Source-Configs} -Configurations controlling the behavior of JDBC source in Deltastreamer. 
- - - -[**Advanced Configs**](#JDBC-Source-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.jdbc.driver.class](#hoodiedeltastreamerjdbcdriverclass) | N/A **(Required)** | Driver class used for JDBC connection | | -| [hoodie.deltastreamer.jdbc.extra.options.](#hoodiedeltastreamerjdbcextraoptions) | N/A **(Required)** | Used to set any extra options the user specifies for jdbc | | -| [hoodie.deltastreamer.jdbc.incr.fallback.to.full.fetch](#hoodiedeltastreamerjdbcincrfallbacktofullfetch) | N/A **(Required)** | If set true, makes incremental fetch to fallback to full fetch in case of any error | | -| [hoodie.deltastreamer.jdbc.incr.pull](#hoodiedeltastreamerjdbcincrpull) | N/A **(Required)** | Will the JDBC source do an incremental pull? | | -| [hoodie.deltastreamer.jdbc.password](#hoodiedeltastreamerjdbcpassword) | N/A **(Required)** | Password used for JDBC connection | | -| [hoodie.deltastreamer.jdbc.password.file](#hoodiedeltastreamerjdbcpasswordfile) | N/A **(Required)** | Base-path for the JDBC password file. | | -| [hoodie.deltastreamer.jdbc.storage.level](#hoodiedeltastreamerjdbcstoragelevel) | N/A **(Required)** | Used to control the persistence level. Default value: MEMORY_AND_DISK_SER | | -| [hoodie.deltastreamer.jdbc.table.incr.column.name](#hoodiedeltastreamerjdbctableincrcolumnname) | N/A **(Required)** | If run in incremental mode, this field is to pull new data incrementally | | -| [hoodie.deltastreamer.jdbc.table.name](#hoodiedeltastreamerjdbctablename) | N/A **(Required)** | RDBMS table to pull | | -| [hoodie.deltastreamer.jdbc.url](#hoodiedeltastreamerjdbcurl) | N/A **(Required)** | JDBC url for the Hoodie datasource. | | -| [hoodie.deltastreamer.jdbc.user](#hoodiedeltastreamerjdbcuser) | N/A **(Required)** | Username used for JDBC connection | | ---- - - -#### Json Kafka Post Processor Configs {#Json-Kafka-Post-Processor-Configs} -Configurations controlling the post processor of Json Kafka Source in Deltastreamer. - - - -[**Advanced Configs**](#Json-Kafka-Post-Processor-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.json.kafka.post.processor.maxwell.database.regex](#hoodiedeltastreamersourcejsonkafkapostprocessormaxwelldatabaseregex) | N/A **(Required)** | Database name regex | | -| [hoodie.deltastreamer.source.json.kafka.post.processor.maxwell.table.regex](#hoodiedeltastreamersourcejsonkafkapostprocessormaxwelltableregex) | N/A **(Required)** | Table name regex | | -| [hoodie.deltastreamer.source.json.kafka.processor.class](#hoodiedeltastreamersourcejsonkafkaprocessorclass) | N/A **(Required)** | Json kafka source post processor class name, post process data after consuming fromsource and before giving it to deltastreamer. 
| | -| [hoodie.deltastreamer.source.json.kafka.post.processor.maxwell.precombine.field.format](#hoodiedeltastreamersourcejsonkafkapostprocessormaxwellprecombinefieldformat) | yyyy-MM-dd HH:mm:ss (Optional) | When the preCombine filed is in DATE_STRING format, use should tell hoodiewhat format it is. 'yyyy-MM-dd HH:mm:ss' by default | | -| [hoodie.deltastreamer.source.json.kafka.post.processor.maxwell.precombine.field.type](#hoodiedeltastreamersourcejsonkafkapostprocessormaxwellprecombinefieldtype) | DATE_STRING (Optional) | Data type of the preCombine field. could be NON_TIMESTAMP, DATE_STRING,UNIX_TIMESTAMP or EPOCHMILLISECONDS. DATE_STRING by default | | ---- - - -#### Kafka Source Configs {#Kafka-Source-Configs} -Configurations controlling the behavior of Kafka source in Deltastreamer. - - - -[**Basic Configs**](#Kafka-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------- | ------------------ | ----------------- | ------------- | -| [hoodie.deltastreamer.source.kafka.topic](#hoodiedeltastreamersourcekafkatopic) | N/A **(Required)** | Kafka topic name. | | - -[**Advanced Configs**](#Kafka-Source-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------------- | ------------------ | ------------------------------------------------------------------------------- | ------------- | -| [auto.offset.reset](#autooffsetreset) | LATEST (Optional) | Kafka consumer strategy for reading data. | | -| [hoodie.deltastreamer.kafka.source.maxEvents](#hoodiedeltastreamerkafkasourcemaxEvents) | 5000000 (Optional) | Maximum number of records obtained in each batch. | | -| [hoodie.deltastreamer.source.kafka.checkpoint.type](#hoodiedeltastreamersourcekafkacheckpointtype) | string (Optional) | Kafka checkpoint type. | | -| [hoodie.deltastreamer.source.kafka.enable.commit.offset](#hoodiedeltastreamersourcekafkaenablecommitoffset) | false (Optional) | Automatically submits offset to kafka. | | -| [hoodie.deltastreamer.source.kafka.enable.failOnDataLoss](#hoodiedeltastreamersourcekafkaenablefailOnDataLoss) | false (Optional) | Fail when checkpoint goes out of bounds instead of seeking to earliest offsets. | | -| [hoodie.deltastreamer.source.kafka.fetch_partition.time.out](#hoodiedeltastreamersourcekafkafetch_partitiontimeout) | 300000 (Optional) | Time out for fetching partitions. 5min by default | | ---- - - -#### Pulsar Source Configs {#Pulsar-Source-Configs} -Configurations controlling the behavior of Pulsar source in Deltastreamer. 
- - - -[**Basic Configs**](#Pulsar-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------- | ---------------------------------- | ------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.pulsar.topic](#hoodiedeltastreamersourcepulsartopic) | N/A **(Required)** | Name of the target Pulsar topic to source data from | | -| [hoodie.deltastreamer.source.pulsar.endpoint.admin.url](#hoodiedeltastreamersourcepulsarendpointadminurl) | http://localhost:8080 (Optional) | URL of the target Pulsar endpoint (of the form 'pulsar://host:port' | | -| [hoodie.deltastreamer.source.pulsar.endpoint.service.url](#hoodiedeltastreamersourcepulsarendpointserviceurl) | pulsar://localhost:6650 (Optional) | URL of the target Pulsar endpoint (of the form 'pulsar://host:port' | | - -[**Advanced Configs**](#Pulsar-Source-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------------------------------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.pulsar.maxRecords](#hoodiedeltastreamersourcepulsarmaxRecords) | 5000000 (Optional) | Max number of records obtained in a single each batch | | -| [hoodie.deltastreamer.source.pulsar.offset.autoResetStrategy](#hoodiedeltastreamersourcepulsaroffsetautoResetStrategy) | LATEST (Optional) | Policy determining how offsets shall be automatically reset in case there's no checkpoint information present | | ---- - - -#### S3 Event-based Hudi Incremental Source Configs {#S3-Event-based-Hudi-Incremental-Source-Configs} -Configurations controlling the behavior of incremental pulling from S3 events meta information from Hudi table as a source in Deltastreamer. - - - -[**Advanced Configs**](#S3-Event-based-Hudi-Incremental-Source-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.source.s3incr.ignore.key.prefix](#hoodiedeltastreamersources3incrignorekeyprefix) | N/A **(Required)** | Control whether to ignore the s3 objects starting with this prefix | | -| [hoodie.deltastreamer.source.s3incr.ignore.key.substring](#hoodiedeltastreamersources3incrignorekeysubstring) | N/A **(Required)** | Control whether to ignore the s3 objects with this substring | | -| [hoodie.deltastreamer.source.s3incr.key.prefix](#hoodiedeltastreamersources3incrkeyprefix) | N/A **(Required)** | Control whether to filter the s3 objects starting with this prefix | | -| [hoodie.deltastreamer.source.s3incr.spark.datasource.options](#hoodiedeltastreamersources3incrsparkdatasourceoptions) | N/A **(Required)** | Json string, passed to the reader while loading dataset. 
Example delta streamer conf --hoodie-conf hoodie.deltastreamer.source.s3incr.spark.datasource.options={"header":"true","encoding":"UTF-8"} | | -| [hoodie.deltastreamer.source.s3incr.check.file.exists](#hoodiedeltastreamersources3incrcheckfileexists) | false (Optional) | Control whether we do existence check for files before consuming them | | -| [hoodie.deltastreamer.source.s3incr.fs.prefix](#hoodiedeltastreamersources3incrfsprefix) | s3 (Optional) | The file system prefix. | | ---- - - -#### S3 Source Configs {#S3-Source-Configs} -Configurations controlling the behavior of S3 source in Deltastreamer. - - - -[**Basic Configs**](#S3-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| -------------------------------------------------------------------------------- | ------------------ | --------------------------------- | ------------- | -| [hoodie.deltastreamer.s3.source.queue.url](#hoodiedeltastreamers3sourcequeueurl) | N/A **(Required)** | Queue url for cloud object events | | - -[**Advanced Configs**](#S3-Source-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.s3.source.queue.fs](#hoodiedeltastreamers3sourcequeuefs) | N/A **(Required)** | File system corresponding to queue. For example, for AWS SQS it is s3/s3a. | | -| [hoodie.deltastreamer.s3.source.queue.long.poll.wait](#hoodiedeltastreamers3sourcequeuelongpollwait) | N/A **(Required)** | Long poll wait time in seconds, If set as 0 then client will fetch on short poll basis. | | -| [hoodie.deltastreamer.s3.source.queue.max.messages.per.batch](#hoodiedeltastreamers3sourcequeuemaxmessagesperbatch) | N/A **(Required)** | Max messages for each batch of delta streamer run. Source will process these maximum number of message at a time. | | -| [hoodie.deltastreamer.s3.source.queue.max.messages.per.request](#hoodiedeltastreamers3sourcequeuemaxmessagesperrequest) | N/A **(Required)** | Max messages for each request | | -| [hoodie.deltastreamer.s3.source.queue.region](#hoodiedeltastreamers3sourcequeueregion) | N/A **(Required)** | Case-sensitive region name of the cloud provider for the queue. For example, "us-east-1". | | -| [hoodie.deltastreamer.s3.source.queue.visibility.timeout](#hoodiedeltastreamers3sourcequeuevisibilitytimeout) | N/A **(Required)** | Visibility timeout for messages in queue. After we consume the message, queue will move the consumed messages to in-flight state, these messages can't be consumed again by source for this timeout period. | | ---- - - -#### SQL Source Configs {#SQL-Source-Configs} -Configurations controlling the behavior of SQL source in Deltastreamer. - - - -[**Basic Configs**](#SQL-Source-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------- | ------------------ | ----------------------------------- | ------------- | -| [hoodie.deltastreamer.source.sql.sql.query](#hoodiedeltastreamersourcesqlsqlquery) | N/A **(Required)** | SQL query for fetching source data. 
| | ---- - - -### DeltaStreamer Schema Provider Configs {#SCHEMA_PROVIDER} -Configurations that control the schema provider for DeltaStreamer. - - -#### DeltaStreamer Schema Provider Configs {#DeltaStreamer-Schema-Provider-Configs} - - - - -[**Basic Configs**](#DeltaStreamer-Schema-Provider-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------- | ------------------ | ------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.schemaprovider.registry.targetUrl](#hoodiedeltastreamerschemaproviderregistrytargetUrl) | N/A **(Required)** | The schema of the target you are writing to e.g. https://foo:bar@schemaregistry.org | | -| [hoodie.deltastreamer.schemaprovider.registry.url](#hoodiedeltastreamerschemaproviderregistryurl) | N/A **(Required)** | The schema of the source you are reading from e.g. https://foo:bar@schemaregistry.org | | - -[**Advanced Configs**](#DeltaStreamer-Schema-Provider-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.schemaprovider.registry.baseUrl](#hoodiedeltastreamerschemaproviderregistrybaseUrl) | N/A **(Required)** | The base URL of the schema registry. | | -| [hoodie.deltastreamer.schemaprovider.registry.schemaconverter](#hoodiedeltastreamerschemaproviderregistryschemaconverter) | N/A **(Required)** | The class name of the custom schema converter to use. | | -| [hoodie.deltastreamer.schemaprovider.registry.sourceUrlSuffix](#hoodiedeltastreamerschemaproviderregistrysourceUrlSuffix) | N/A **(Required)** | The source URL suffix. | | -| [hoodie.deltastreamer.schemaprovider.registry.targetUrlSuffix](#hoodiedeltastreamerschemaproviderregistrytargetUrlSuffix) | N/A **(Required)** | The target URL suffix. | | -| [hoodie.deltastreamer.schemaprovider.registry.urlSuffix](#hoodiedeltastreamerschemaproviderregistryurlSuffix) | N/A **(Required)** | The suffix of the URL for the schema registry. | | -| [hoodie.deltastreamer.schemaprovider.spark_avro_post_processor.enable](#hoodiedeltastreamerschemaproviderspark_avro_post_processorenable) | false (Optional) | Whether to enable Spark Avro post processor. | | ---- - - -#### File-based Schema Provider Configs {#File-based-Schema-Provider-Configs} -Configurations for file-based schema provider. - - - -[**Basic Configs**](#File-based-Schema-Provider-Configs-basic-configs) - - -| Config Name | Default | Description | Since Version | -| ------------------------------------------------------------------------------------------------------------ | ------------------ | --------------------------------------------- | ------------- | -| [hoodie.deltastreamer.schemaprovider.source.schema.file](#hoodiedeltastreamerschemaprovidersourceschemafile) | N/A **(Required)** | The schema of the source you are reading from | | -| [hoodie.deltastreamer.schemaprovider.target.schema.file](#hoodiedeltastreamerschemaprovidertargetschemafile) | N/A **(Required)** | The schema of the target you are writing to | | ---- - - -#### Hive Schema Provider Configs {#Hive-Schema-Provider-Configs} -Configurations for Hive schema provider. 
- - - -[**Advanced Configs**](#Hive-Schema-Provider-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.schemaprovider.source.schema.hive.database](#hoodiedeltastreamerschemaprovidersourceschemahivedatabase) | N/A **(Required)** | Hive database from where source schema can be fetched | | -| [hoodie.deltastreamer.schemaprovider.source.schema.hive.table](#hoodiedeltastreamerschemaprovidersourceschemahivetable) | N/A **(Required)** | Hive table from where source schema can be fetched | | -| [hoodie.deltastreamer.schemaprovider.target.schema.hive.database](#hoodiedeltastreamerschemaprovidertargetschemahivedatabase) | N/A **(Required)** | Hive database from where target schema can be fetched | | -| [hoodie.deltastreamer.schemaprovider.target.schema.hive.table](#hoodiedeltastreamerschemaprovidertargetschemahivetable) | N/A **(Required)** | Hive table from where target schema can be fetched | | +| Config Name | Default | Description | +| -------------------------------------------------- | ------------------ | ------------------------------------------------------------------------------------------- | +| [hoodie.aws.access.key](#hoodieawsaccesskey) | N/A **(Required)** | AWS access key id

`Config Param: AWS_ACCESS_KEY`
`Since Version: 0.10.0` | +| [hoodie.aws.secret.key](#hoodieawssecretkey) | N/A **(Required)** | AWS secret key

`Config Param: AWS_SECRET_KEY`
`Since Version: 0.10.0` | +| [hoodie.aws.session.token](#hoodieawssessiontoken) | N/A **(Required)** | AWS session token

`Config Param: AWS_SESSION_TOKEN`
`Since Version: 0.10.0` | --- +## Hudi Streamer Configs {#HUDI_STREAMER} +These set of configs are used for Hudi Streamer utility which provides the way to ingest from different sources such as DFS or Kafka. -#### JDBC-based Schema Provider Configs {#JDBC-based-Schema-Provider-Configs} -Configurations for JDBC-based schema provider. +### Hudi Streamer Configs {#Hudi-Streamer-Configs} -[**Advanced Configs**](#JDBC-based-Schema-Provider-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ---------------------------------------------------------------------------------------------------------------------------------------- | ------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.schemaprovider.source.schema.jdbc.connection.url](#hoodiedeltastreamerschemaprovidersourceschemajdbcconnectionurl) | N/A **(Required)** | The JDBC URL to connect to. The source-specific connection properties may be specified in the URL. e.g., jdbc:postgresql://localhost/test?user=fred&password=secret | | -| [hoodie.deltastreamer.schemaprovider.source.schema.jdbc.dbtable](#hoodiedeltastreamerschemaprovidersourceschemajdbcdbtable) | N/A **(Required)** | The table with the schema to reference e.g. test_database.test1_table or test1_table | | -| [hoodie.deltastreamer.schemaprovider.source.schema.jdbc.driver.type](#hoodiedeltastreamerschemaprovidersourceschemajdbcdrivertype) | N/A **(Required)** | The class name of the JDBC driver to use to connect to this URL. e.g. org.h2.Driver | | -| [hoodie.deltastreamer.schemaprovider.source.schema.jdbc.nullable](#hoodiedeltastreamerschemaprovidersourceschemajdbcnullable) | N/A **(Required)** | If true, all the columns are nullable. | | -| [hoodie.deltastreamer.schemaprovider.source.schema.jdbc.password](#hoodiedeltastreamerschemaprovidersourceschemajdbcpassword) | N/A **(Required)** | Password for the connection e.g. secret | | -| [hoodie.deltastreamer.schemaprovider.source.schema.jdbc.timeout](#hoodiedeltastreamerschemaprovidersourceschemajdbctimeout) | N/A **(Required)** | The number of seconds the driver will wait for a Statement object to execute. Zero means there is no limit. In the write path, this option depends on how JDBC drivers implement the API setQueryTimeout, e.g., the h2 JDBC driver checks the timeout of each query instead of an entire JDBC batch. It defaults to 0. | | -| [hoodie.deltastreamer.schemaprovider.source.schema.jdbc.username](#hoodiedeltastreamerschemaprovidersourceschemajdbcusername) | N/A **(Required)** | Username for the connection e.g. fred | | ---- - - -#### JDBC-based Schema Provider Configs {#JDBC-based-Schema-Provider-Configs} -Configurations for Proto schema provider. 
- - - -[**Advanced Configs**](#JDBC-based-Schema-Provider-Configs-advanced-configs) - - -| Config Name | Default | Description | Since Version | -| ----------------------------------------------------------------------------------------------------------------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------- | -| [hoodie.deltastreamer.schemaprovider.proto.class.name](#hoodiedeltastreamerschemaproviderprotoclassname) | N/A **(Required)** | The Protobuf Message class used as the source for the schema. | 0.13.0 | -| [hoodie.deltastreamer.schemaprovider.proto.flatten.wrappers](#hoodiedeltastreamerschemaproviderprotoflattenwrappers) | false (Optional) | When set to true wrapped primitives like Int64Value are translated to a record with a single 'value' field. By default, the value is false and the wrapped primitives are treated as a nullable value | 0.13.0 | -| [hoodie.deltastreamer.schemaprovider.proto.max.recursion.depth](#hoodiedeltastreamerschemaproviderprotomaxrecursiondepth) | 5 (Optional) | The max depth to unravel the Proto schema when translating into an Avro schema. Setting this depth allows the user to convert a schema that is recursive in proto into something that can be represented in their lake format like Parquet. After a given class has been seen N times within a single branch, the schema provider will create a record with a byte array to hold the remaining proto data and a string to hold the message descriptor's name for context. | 0.13.0 | -| [hoodie.deltastreamer.schemaprovider.proto.timestamps.as.records](#hoodiedeltastreamerschemaproviderprototimestampsasrecords) | false (Optional) | When set to true Timestamp fields are translated to a record with a seconds and nanos field. By default, the value is false and the timestamp is converted to a long with the timestamp-micros logical type | 0.13.0 | ---- -#### Schema Post Processor Config Configs {#Schema-Post-Processor-Config-Configs} -Configurations for Schema Post Processor +[**Basic Configs**](#Hudi-Streamer-Configs-basic-configs) +| Config Name | Default | Description | +| ------------------------------------------------------------------------------------- | ------------------ | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| [hoodie.deltastreamer.source.kafka.topic](#hoodiedeltastreamersourcekafkatopic) | N/A **(Required)** | Kafka topic name. The config is specific to HoodieMultiTableDeltaStreamer

`Config Param: KAFKA_TOPIC` | +| [hoodie.deltastreamer.sample.writes.enabled](#hoodiedeltastreamersamplewritesenabled) | false (Optional) | Set this to true to sample from the first batch of records and write to the auxiliary path, before writing to the table. The sampled records are used to calculate the average record size. The relevant write client will have `hoodie.copyonwrite.record.size.estimate` being overwritten by the calculated result.

`Config Param: SAMPLE_WRITES_ENABLED` | +| [hoodie.deltastreamer.sample.writes.size](#hoodiedeltastreamersamplewritessize) | 5000 (Optional) | Number of records to sample from the first write. To improve the estimation's accuracy, set a larger sample size for smaller or more compressible records, and a smaller one for bigger or less compressible records.

`Config Param: SAMPLE_WRITES_SIZE` | -[**Advanced Configs**](#Schema-Post-Processor-Config-Configs-advanced-configs) +[**Advanced Configs**](#Hudi-Streamer-Configs-advanced-configs) -| Config Name | Default | Description | Since Version | -| --------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------ | ----------------------------------------------- | ------------- | -| [hoodie.deltastreamer.schemaprovider.schema_post_processor](#hoodiedeltastreamerschemaproviderschema_post_processor) | N/A **(Required)** | The class name of the schema post processor. | | -| [hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.default](#hoodiedeltastreamerschemaproviderschema_post_processoraddcolumndefault) | N/A **(Required)** | New column's default value | | -| [hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.doc](#hoodiedeltastreamerschemaproviderschema_post_processoraddcolumndoc) | N/A **(Required)** | Docs about new column | | -| [hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.name](#hoodiedeltastreamerschemaproviderschema_post_processoraddcolumnname) | N/A **(Required)** | New column's name | | -| [hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.type](#hoodiedeltastreamerschemaproviderschema_post_processoraddcolumntype) | N/A **(Required)** | New column's type | | -| [hoodie.deltastreamer.schemaprovider.schema_post_processor.delete.columns](#hoodiedeltastreamerschemaproviderschema_post_processordeletecolumns) | N/A **(Required)** | Columns to delete in the schema post processor. | | -| [hoodie.deltastreamer.schemaprovider.schema_post_processor.add.column.nullable](#hoodiedeltastreamerschemaproviderschema_post_processoraddcolumnnullable) | true (Optional) | New column's nullable | | +| Config Name | Default | Description | +| ---------------------------------------------------------------------------------------------------------------------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| [hoodie.deltastreamer.checkpoint.provider.path](#hoodiedeltastreamercheckpointproviderpath) | N/A **(Required)** | The path for providing the checkpoints.

`Config Param: CHECKPOINT_PROVIDER_PATH` | +| [hoodie.deltastreamer.ingestion.tablesToBeIngested](#hoodiedeltastreameringestiontablesToBeIngested) | N/A **(Required)** | Comma separated names of tables to be ingested in the format <database>.<table>, for example db1.table1,db1.table2

`Config Param: TABLES_TO_BE_INGESTED` | +| [hoodie.deltastreamer.ingestion.targetBasePath](#hoodiedeltastreameringestiontargetBasePath) | N/A **(Required)** | The path to which a particular table is ingested. The config is specific to HoodieMultiTableStreamer and overrides path determined using option `--base-path-prefix` for a table. This config is ignored for a single table deltastreamer

`Config Param: TARGET_BASE_PATH` | +| [hoodie.deltastreamer.multiwriter.source.checkpoint.id](#hoodiedeltastreamermultiwritersourcecheckpointid) | N/A **(Required)** | Unique Id to be used for multi-writer deltastreamer scenario. This is the scenario when multiple deltastreamers are used to write to the same target table. If you are just using a single deltastreamer for a table then you do not need to set this config.

`Config Param: MUTLI_WRITER_SOURCE_CHECKPOINT_ID` | +| [hoodie.deltastreamer.source.kafka.append.offsets](#hoodiedeltastreamersourcekafkaappendoffsets) | false (Optional) | When enabled, appends Kafka offset info such as the source offset (_hoodie_kafka_source_offset), partition (_hoodie_kafka_source_partition) and timestamp (_hoodie_kafka_source_timestamp) to the records. By default this is disabled and no Kafka offsets are added

`Config Param: KAFKA_APPEND_OFFSETS` | +| [hoodie.deltastreamer.source.sanitize.invalid.char.mask](#hoodiedeltastreamersourcesanitizeinvalidcharmask) | __ (Optional) | Defines the character sequence that replaces invalid characters in schema field names if hoodie.deltastreamer.source.sanitize.invalid.schema.field.names is enabled.

`Config Param: SCHEMA_FIELD_NAME_INVALID_CHAR_MASK` | +| [hoodie.deltastreamer.source.sanitize.invalid.schema.field.names](#hoodiedeltastreamersourcesanitizeinvalidschemafieldnames) | false (Optional) | Sanitizes invalid schema field names, both in the data read from the source and in the schema itself, replacing invalid characters with hoodie.deltastreamer.source.sanitize.invalid.char.mask. Invalid characters are determined by the Avro naming convention (https://avro.apache.org/docs/current/spec.html#names).

`Config Param: SANITIZE_SCHEMA_FIELD_NAMES` | --- diff --git a/website/src/css/custom.css b/website/src/css/custom.css index cd66f10f3c20..a91944297653 100644 --- a/website/src/css/custom.css +++ b/website/src/css/custom.css @@ -351,28 +351,17 @@ h1.blogPostTitle_src-theme-BlogPostItem-styles-module{ } .docs-custom-styles tr td:nth-child(1) { - width: 30%; + width: 20%; word-break: break-all; } .docs-custom-styles tr td:nth-child(2) { - width: 20%; + width: 15%; word-break: break-all; } .docs-custom-styles tr td:nth-child(3) { - width: 45%; -} - -.docs-custom-styles tr td:nth-child(4) { - width: 5%; -} - -.docs-custom-styles h2 { - position: sticky; - top: 80px; - z-index: 11; - background: #fff; + width: 65%; } .docs-custom-styles th { From 3420acad4e54200d2f86ea8bbe17cf9152f8565a Mon Sep 17 00:00:00 2001 From: Y Ethan Guo Date: Wed, 26 Jul 2023 13:39:43 -0700 Subject: [PATCH 2/3] Update website/docs/basic_configurations.md --- website/docs/basic_configurations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/basic_configurations.md b/website/docs/basic_configurations.md index 8aa82bbd2567..abeb077324e3 100644 --- a/website/docs/basic_configurations.md +++ b/website/docs/basic_configurations.md @@ -11,7 +11,7 @@ This page covers the basic configurations you may use to write/read Hudi tables. - [**Flink Sql Configs**](#FLINK_SQL): These configs control the Hudi Flink SQL source/sink connectors, providing ability to define record keys, pick out the write operation, specify how to merge records, enable/disable asynchronous compaction or choosing query type to read. - [**Write Client Configs**](#WRITE_CLIENT): Internally, the Hudi datasource uses a RDD based HoodieWriteClient API to actually perform writes to storage. These configs provide deep control over lower level aspects like file sizing, compression, parallelism, compaction, write schema, cleaning etc. Although Hudi provides sane defaults, from time-time these configs may need to be tweaked to optimize for specific workloads. - [**Metastore and Catalog Sync Configs**](#META_SYNC): Configurations used by the Hudi to sync metadata to external metastores and catalogs. -- [**Metrics Configs**](#METRICS): These set of configs are used to enable monitoring and reporting of keyHudi stats and metrics. +- [**Metrics Configs**](#METRICS): These set of configs are used to enable monitoring and reporting of key Hudi stats and metrics. - [**Kafka Connect Configs**](#KAFKA_CONNECT): These set of configs are used for Kafka Connect Sink Connector for writing Hudi Tables - [**Hudi Streamer Configs**](#HUDI_STREAMER): These set of configs are used for Hudi Streamer utility which provides the way to ingest from different sources such as DFS or Kafka. 
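The Hudi Streamer keys documented in the tables above are plain string properties, so they can be collected into a properties file or passed one by one via `--hoodie-conf` (as the S3 incremental source example above shows). The sketch below is illustrative only: the class name, the chosen values, and the `streamer.properties` output path are assumptions, not part of Hudi.

```java
import java.io.IOException;
import java.io.Writer;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Properties;

public class HudiStreamerPropsSketch {
  public static void main(String[] args) throws IOException {
    Properties props = new Properties();
    // Keys taken from the Hudi Streamer config tables above; values are illustrative.
    props.setProperty("hoodie.deltastreamer.source.kafka.topic", "orders");   // required, no default
    props.setProperty("hoodie.deltastreamer.sample.writes.enabled", "true");  // default: false
    props.setProperty("hoodie.deltastreamer.sample.writes.size", "10000");    // default: 5000
    props.setProperty("hoodie.deltastreamer.source.sanitize.invalid.schema.field.names", "true"); // default: false
    props.setProperty("hoodie.deltastreamer.source.sanitize.invalid.char.mask", "__");            // default: __
    // Write the assembled properties to a file that the streamer job can consume.
    try (Writer out = Files.newBufferedWriter(Paths.get("streamer.properties"))) {
      props.store(out, "Hudi Streamer properties (illustrative)");
    }
  }
}
```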
From 63ae77e63c878efc019ede29bb05d5ee7e893383 Mon Sep 17 00:00:00 2001 From: Y Ethan Guo Date: Wed, 26 Jul 2023 13:40:23 -0700 Subject: [PATCH 3/3] Update configurations.md --- website/docs/configurations.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/docs/configurations.md b/website/docs/configurations.md index 41c5752e8f2a..6040efe515e7 100644 --- a/website/docs/configurations.md +++ b/website/docs/configurations.md @@ -24,7 +24,7 @@ By default, Hudi would load the configuration file under `/etc/hudi/conf` direct - [**Flink Sql Configs**](#FLINK_SQL): These configs control the Hudi Flink SQL source/sink connectors, providing ability to define record keys, pick out the write operation, specify how to merge records, enable/disable asynchronous compaction or choosing query type to read. - [**Write Client Configs**](#WRITE_CLIENT): Internally, the Hudi datasource uses a RDD based HoodieWriteClient API to actually perform writes to storage. These configs provide deep control over lower level aspects like file sizing, compression, parallelism, compaction, write schema, cleaning etc. Although Hudi provides sane defaults, from time-time these configs may need to be tweaked to optimize for specific workloads. - [**Metastore and Catalog Sync Configs**](#META_SYNC): Configurations used by the Hudi to sync metadata to external metastores and catalogs. -- [**Metrics Configs**](#METRICS): These set of configs are used to enable monitoring and reporting of keyHudi stats and metrics. +- [**Metrics Configs**](#METRICS): These set of configs are used to enable monitoring and reporting of key Hudi stats and metrics. - [**Record Payload Config**](#RECORD_PAYLOAD): This is the lowest level of customization offered by Hudi. Record payloads define how to produce new values to upsert based on incoming new record and stored old record. Hudi provides default implementations such as OverwriteWithLatestAvroPayload which simply update table with the latest/last-written record. This can be overridden to a custom class extending HoodieRecordPayload class, on both datasource and WriteClient levels. - [**Kafka Connect Configs**](#KAFKA_CONNECT): These set of configs are used for Kafka Connect Sink Connector for writing Hudi Tables - [**Amazon Web Services Configs**](#AWS): Configurations specific to Amazon Web Services.
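The AWS credential configs documented in the first patch (`hoodie.aws.access.key`, `hoodie.aws.secret.key`, `hoodie.aws.session.token`) can be assembled the same way. A minimal sketch follows; sourcing the values from the standard AWS environment variables is an illustrative choice, not a Hudi requirement.

```java
import java.util.Properties;

public final class HudiAwsCredentialProps {
  // Builds the three AWS credential configs documented above from environment variables.
  public static Properties awsProps() {
    Properties props = new Properties();
    props.setProperty("hoodie.aws.access.key", System.getenv().getOrDefault("AWS_ACCESS_KEY_ID", ""));
    props.setProperty("hoodie.aws.secret.key", System.getenv().getOrDefault("AWS_SECRET_ACCESS_KEY", ""));
    props.setProperty("hoodie.aws.session.token", System.getenv().getOrDefault("AWS_SESSION_TOKEN", ""));
    return props;
  }

  private HudiAwsCredentialProps() {}
}
```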