-
Notifications
You must be signed in to change notification settings - Fork 2.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Use SupportsPrefixOperations for Remove OrphanFile Procedure #11906
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,7 @@ | |
import java.util.function.Consumer; | ||
import java.util.function.Predicate; | ||
import java.util.stream.Collectors; | ||
import java.util.stream.Stream; | ||
import org.apache.hadoop.conf.Configuration; | ||
import org.apache.hadoop.fs.FileStatus; | ||
import org.apache.hadoop.fs.FileSystem; | ||
|
@@ -50,6 +51,7 @@ | |
import org.apache.iceberg.hadoop.HiddenPathFilter; | ||
import org.apache.iceberg.io.BulkDeletionFailureException; | ||
import org.apache.iceberg.io.SupportsBulkOperations; | ||
import org.apache.iceberg.io.SupportsPrefixOperations; | ||
import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; | ||
import org.apache.iceberg.relocated.com.google.common.base.Preconditions; | ||
import org.apache.iceberg.relocated.com.google.common.base.Strings; | ||
|
@@ -292,19 +294,49 @@ private Dataset<FileURI> validFileIdentDS() { | |
|
||
private Dataset<FileURI> actualFileIdentDS() { | ||
StringToFileURI toFileURI = new StringToFileURI(equalSchemes, equalAuthorities); | ||
Dataset<String> dataList; | ||
if (compareToFileList == null) { | ||
return toFileURI.apply(listedFileDS()); | ||
dataList = | ||
table.io() instanceof SupportsPrefixOperations ? listWithPrefix() : listWithoutPrefix(); | ||
} else { | ||
return toFileURI.apply(filteredCompareToFileList()); | ||
dataList = filteredCompareToFileList(); | ||
} | ||
|
||
return toFileURI.apply(dataList); | ||
} | ||
|
||
@VisibleForTesting | ||
Dataset<String> listWithPrefix() { | ||
List<String> matchingFiles = Lists.newArrayList(); | ||
// listPrefix only returns files, so we additionally need to check the parent folders of each file. | ||
// In the following example the file itself is not filtered out, | ||
// but it should be excluded due to its parent folder: `_c2_trunc` | ||
// "/data/_c2_trunc/file.txt" | ||
PathFilter pathFilter = PartitionAwareHiddenPathFilter.forSpecs(table.specs(), true); | ||
|
||
Iterator<org.apache.iceberg.io.FileInfo> iterator = | ||
((SupportsPrefixOperations) table.io()).listPrefix(location).iterator(); | ||
while (iterator.hasNext()) { | ||
org.apache.iceberg.io.FileInfo fileInfo = iterator.next(); | ||
// NOTE: check the path relative to the table location, to avoid checking unnecessary root | ||
// folders | ||
Path relativeFilePath = new Path(fileInfo.location().replace(location, "")); | ||
Comment on lines
+321
to
+323
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Creating a relative path to avoid checking the parent folders of the table. However, this |
||
if (fileInfo.createdAtMillis() < olderThanTimestamp && pathFilter.accept(relativeFilePath)) { | ||
matchingFiles.add(fileInfo.location()); | ||
} | ||
} | ||
JavaRDD<String> matchingFileRDD = sparkContext().parallelize(matchingFiles, 1); | ||
return spark().createDataset(matchingFileRDD.rdd(), Encoders.STRING()); | ||
} | ||
|
||
private Dataset<String> listedFileDS() { | ||
@VisibleForTesting | ||
Dataset<String> listWithoutPrefix() { | ||
ismailsimsek marked this conversation as resolved.
Show resolved
Hide resolved
|
||
List<String> subDirs = Lists.newArrayList(); | ||
List<String> matchingFiles = Lists.newArrayList(); | ||
|
||
Predicate<FileStatus> predicate = file -> file.getModificationTime() < olderThanTimestamp; | ||
PathFilter pathFilter = PartitionAwareHiddenPathFilter.forSpecs(table.specs()); | ||
// Don't check parent folders because they are already checked by the recursive call | ||
PathFilter pathFilter = PartitionAwareHiddenPathFilter.forSpecs(table.specs(), false); | ||
|
||
// list at most MAX_DRIVER_LISTING_DEPTH levels and only dirs that have | ||
// less than MAX_DRIVER_LISTING_DIRECT_SUB_DIRS direct sub dirs on the driver | ||
|
@@ -330,7 +362,6 @@ private Dataset<String> listedFileDS() { | |
Broadcast<SerializableConfiguration> conf = sparkContext().broadcast(hadoopConf); | ||
ListDirsRecursively listDirs = new ListDirsRecursively(conf, olderThanTimestamp, pathFilter); | ||
JavaRDD<String> matchingLeafFileRDD = subDirRDD.mapPartitions(listDirs); | ||
|
||
JavaRDD<String> completeMatchingFileRDD = matchingFileRDD.union(matchingLeafFileRDD); | ||
return spark().createDataset(completeMatchingFileRDD.rdd(), Encoders.STRING()); | ||
} | ||
|
@@ -589,21 +620,42 @@ private FileURI toFileURI(I input) { | |
static class PartitionAwareHiddenPathFilter implements PathFilter, Serializable { | ||
|
||
private final Set<String> hiddenPathPartitionNames; | ||
private final boolean checkParents; | ||
|
||
PartitionAwareHiddenPathFilter(Set<String> hiddenPathPartitionNames) { | ||
PartitionAwareHiddenPathFilter(Set<String> hiddenPathPartitionNames, boolean checkParents) { | ||
this.hiddenPathPartitionNames = hiddenPathPartitionNames; | ||
this.checkParents = checkParents; | ||
} | ||
|
||
@Override | ||
public boolean accept(Path path) { | ||
if (!checkParents) { | ||
return doAccept(path); | ||
} | ||
|
||
// if any of the parent folders is not accepted then return false | ||
return doAccept(path) && !hasHiddenPttParentFolder(path); | ||
} | ||
|
||
private boolean doAccept(Path path) { | ||
return isHiddenPartitionPath(path) || HiddenPathFilter.get().accept(path); | ||
} | ||
|
||
/** | ||
* Iterates through the parent folders if any of the parent folders of the given path is a | ||
* hidden partition folder. | ||
*/ | ||
public boolean hasHiddenPttParentFolder(Path path) { | ||
return Stream.iterate(path, Path::getParent) | ||
.takeWhile(Objects::nonNull) | ||
.anyMatch(parentPath -> !doAccept(parentPath)); | ||
} | ||
Comment on lines
+648
to
+652
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Now it will check the parent folders per file, to ensure none of the parent folders is a hidden partition folder. This might be less performant for large lists, if performance is a concern. |
||
|
||
private boolean isHiddenPartitionPath(Path path) { | ||
return hiddenPathPartitionNames.stream().anyMatch(path.getName()::startsWith); | ||
} | ||
|
||
static PathFilter forSpecs(Map<Integer, PartitionSpec> specs) { | ||
static PathFilter forSpecs(Map<Integer, PartitionSpec> specs, boolean checkParents) { | ||
if (specs == null) { | ||
return HiddenPathFilter.get(); | ||
} | ||
|
@@ -619,7 +671,7 @@ static PathFilter forSpecs(Map<Integer, PartitionSpec> specs) { | |
if (partitionNames.isEmpty()) { | ||
return HiddenPathFilter.get(); | ||
} else { | ||
return new PartitionAwareHiddenPathFilter(partitionNames); | ||
return new PartitionAwareHiddenPathFilter(partitionNames, checkParents); | ||
} | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We need a way to break up the key space, possibly by taking hints from which
LocationProvider
is configured for the table. A single listing is not scalable.