Skip to content

Commit

Permalink
Uniform iceberg conversion transaction should not convert commit with…
Browse files Browse the repository at this point in the history
… only AddFiles without datachange - 3.1 (#3619)

<!--
Thanks for sending a pull request!  Here are some tips for you:
1. If this is your first time, please read our contributor guidelines:
https://github.com/delta-io/delta/blob/master/CONTRIBUTING.md
2. If the PR is unfinished, add '[WIP]' in your PR title, e.g., '[WIP]
Your PR title ...'.
  3. Be sure to keep the PR description updated to reflect all changes.
  4. Please write your PR title to summarize what this PR proposes.
5. If possible, provide a concise example to reproduce the issue for a
faster review.
6. If applicable, include the corresponding issue number in the PR title
and link it in the body.
-->

#### Which Delta project/connector is this regarding?
<!--
Please add the component selected below to the beginning of the pull
request title
For example: [Spark] Title of my pull request
-->

- [x] Spark
- [ ] Standalone
- [ ] Flink
- [ ] Kernel
- [ ] Other (fill in here)

## Description

The UniForm Iceberg conversion transaction should not convert a commit that
contains only AddFiles without data change. Otherwise, the conversion results
in duplicate AddFile entries in Iceberg.
lzlfred authored Aug 28, 2024

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
1 parent 10ed75a commit dd9d5d8
Showing 1 changed file with 12 additions and 4 deletions.
Original file line number Diff line number Diff line change
@@ -361,6 +361,7 @@ class IcebergConverter(spark: SparkSession)
var hasRemoves = false
var hasDataChange = false
var hasCommitInfo = false
var commitInfo: Option[CommitInfo] = None
breakable {
for (action <- actionsToCommit) {
action match {
@@ -370,7 +371,9 @@ class IcebergConverter(spark: SparkSession)
case r: RemoveFile =>
hasRemoves = true
if (r.dataChange) hasDataChange = true
case _: CommitInfo => hasCommitInfo = true
case ci: CommitInfo =>
commitInfo = Some(ci)
hasCommitInfo = true
case _ => // Do nothing
}
if (hasAdds && hasRemoves && hasDataChange && hasCommitInfo) break // Short-circuit
@@ -404,9 +407,14 @@ class IcebergConverter(spark: SparkSession)
}
overwriteHelper.commit()
} else if (hasAdds) {
val appendHelper = icebergTxn.getAppendOnlyHelper()
addsAndRemoves.foreach(action => appendHelper.add(action.add))
appendHelper.commit()
if (!hasRemoves && !hasDataChange && allDeltaActionsCaptured) {
logInfo(s"Skip Iceberg conversion for commit that only has AddFiles " +
s"without any RemoveFiles or data change. CommitInfo: $commitInfo")
} else {
val appendHelper = icebergTxn.getAppendOnlyHelper()
addsAndRemoves.foreach(action => appendHelper.add(action.add))
appendHelper.commit()
}
} else if (hasRemoves) {
val removeHelper = icebergTxn.getRemoveOnlyHelper()
addsAndRemoves.foreach(action => removeHelper.remove(action.remove))

0 comments on commit dd9d5d8

Please sign in to comment.