-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Kernel] Support for loading protocol and metadata from checksum file…
… in DeltaLog Initial read current version CRC w test to verify wip end-2-end working + tests Co-authored-by: Allison Portis <[email protected]> Co-authored-by: Venki Korukanti <[email protected]>
- Loading branch information
1 parent
2c444a2
commit 069a1f9
Showing
32 changed files
with
519 additions
and
76 deletions.
There are no files selected for viewing
133 changes: 133 additions & 0 deletions
133
kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/ChecksumReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
/* | ||
* Copyright (2024) The Delta Lake Project Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.delta.kernel.internal.replay; | ||
|
||
import java.util.*; | ||
|
||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
import io.delta.kernel.data.ColumnarBatch; | ||
import io.delta.kernel.engine.Engine; | ||
import io.delta.kernel.utils.CloseableIterator; | ||
import io.delta.kernel.utils.FileStatus; | ||
|
||
import io.delta.kernel.internal.fs.Path; | ||
import io.delta.kernel.internal.util.FileNames; | ||
import static io.delta.kernel.internal.replay.VersionStats.fromColumnarBatch; | ||
import static io.delta.kernel.internal.util.FileNames.checksumFile; | ||
import static io.delta.kernel.internal.util.FileNames.isChecksumFile; | ||
import static io.delta.kernel.internal.util.Utils.singletonCloseableIterator; | ||
|
||
/** | ||
* Utility method to load protocol and metadata from the Delta log checksum files. | ||
*/ | ||
public class ChecksumReader { | ||
private static final Logger logger = LoggerFactory.getLogger(ChecksumReader.class); | ||
|
||
/** | ||
* Load the protocol and metadata from the checksum file at the given version. If the checksum | ||
* file is not found at the given version, it will try to find the latest checksum file that is | ||
* created after the lower bound version or within the last 100 versions. | ||
* | ||
* @param engine the engine to use for reading the checksum file | ||
* @param logPath the path to the Delta log | ||
* @param readVersion the version to read the checksum file from | ||
* @param lowerBoundOpt the lower bound version to search for the checksum file | ||
* @return Optional {@link VersionStats} containing the protocol and metadata, and the version | ||
* of the checksum file. If the checksum file is not found, it will return an empty | ||
*/ | ||
public static Optional<VersionStats> getVersionStats( | ||
Engine engine, | ||
Path logPath, | ||
long readVersion, | ||
Optional<Long> lowerBoundOpt) { | ||
|
||
// First try to load the CRC at given version. If not found or failed to read then try to | ||
// find the latest CRC file that is created after the lower bound version or within the | ||
// last 100 versions. | ||
Path crcFilePath = checksumFile(logPath, readVersion); | ||
Optional<VersionStats> versionStatsOpt = readChecksumFile(engine, crcFilePath); | ||
if (versionStatsOpt.isPresent() || | ||
// we don't expect any more checksum files as it is the first version | ||
readVersion == 0) { | ||
return versionStatsOpt; | ||
} | ||
|
||
// Try to list the last 100 CRC files and see if we can find a CRC that we can use | ||
long lowerBound = Math.max( | ||
lowerBoundOpt.orElse(0L) + 1, | ||
Math.max(0, readVersion - 100)); | ||
|
||
Path listFrom = checksumFile(logPath, lowerBound); | ||
try (CloseableIterator<FileStatus> crcFiles = | ||
engine.getFileSystemClient().listFrom(listFrom.toString())) { | ||
|
||
List<FileStatus> crcFilesList = new ArrayList<>(); | ||
crcFiles.filter(file -> isChecksumFile(new Path(file.getPath()))) | ||
.forEachRemaining(crcFilesList::add); | ||
|
||
// pick the last file which is the latest version that has the CRC file | ||
if (crcFilesList.isEmpty()) { | ||
logger.warn("No checksum files found in the range {} to {}", lowerBound, | ||
readVersion); | ||
return Optional.empty(); | ||
} | ||
|
||
FileStatus latestCRCFile = crcFilesList.get(crcFilesList.size() - 1); | ||
return readChecksumFile(engine, new Path(latestCRCFile.getPath())); | ||
} catch (Exception e) { | ||
logger.warn("Failed to list checksum files from {}", listFrom, e); | ||
return Optional.empty(); | ||
} | ||
|
||
} | ||
|
||
private static Optional<VersionStats> readChecksumFile(Engine engine, Path filePath) { | ||
try (CloseableIterator<ColumnarBatch> iter = engine.getJsonHandler() | ||
.readJsonFiles( | ||
singletonCloseableIterator(FileStatus.of(filePath.toString())), | ||
VersionStats.FULL_SCHEMA, | ||
Optional.empty())) { | ||
// We do this instead of iterating through the rows or using `getSingularRow` so we | ||
// can use the existing fromColumnVector methods in Protocol, Metadata, Format etc | ||
if (!iter.hasNext()) { | ||
logger.warn("Checksum file is empty: {}", filePath); | ||
return Optional.empty(); | ||
} | ||
|
||
ColumnarBatch batch = iter.next(); | ||
if (batch.getSize() != 1) { | ||
String msg = "Expected exactly one row in the checksum file {}, found {} rows"; | ||
logger.warn(msg, filePath, batch.getSize()); | ||
return Optional.empty(); | ||
} | ||
|
||
long crcVersion = FileNames.checksumVersion(filePath); | ||
|
||
VersionStats versionStats = fromColumnarBatch(engine, crcVersion, batch, 0 /* rowId */); | ||
if (versionStats.getMetadata() == null || versionStats.getProtocol() == null) { | ||
logger.warn("Invalid checksum file missing protocol and/or metadata: {}", filePath); | ||
return Optional.empty(); | ||
} | ||
return Optional.of(versionStats); | ||
} catch (Exception e) { | ||
// This can happen when the version does not have a checksum file | ||
logger.warn("Failed to read checksum file {}", filePath, e); | ||
return Optional.empty(); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
74 changes: 74 additions & 0 deletions
74
kernel/kernel-api/src/main/java/io/delta/kernel/internal/replay/VersionStats.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
/* | ||
* Copyright (2024) The Delta Lake Project Authors. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
package io.delta.kernel.internal.replay; | ||
|
||
import io.delta.kernel.data.ColumnarBatch; | ||
import io.delta.kernel.engine.Engine; | ||
import io.delta.kernel.types.StructType; | ||
import io.delta.kernel.internal.actions.Metadata; | ||
import io.delta.kernel.internal.actions.Protocol; | ||
|
||
public class VersionStats { | ||
|
||
public static VersionStats fromColumnarBatch( | ||
Engine engine, long version, ColumnarBatch batch, int rowId) { | ||
// fromColumnVector already takes care of nulls | ||
Protocol protocol = Protocol.fromColumnVector( | ||
batch.getColumnVector(PROTOCOL_ORDINAL), rowId); | ||
Metadata metadata = Metadata.fromColumnVector( | ||
batch.getColumnVector(METADATA_ORDINAL), rowId, engine); | ||
return new VersionStats(version, metadata, protocol); | ||
} | ||
|
||
// We can add additional fields later | ||
public static final StructType FULL_SCHEMA = new StructType() | ||
.add("protocol", Protocol.FULL_SCHEMA) | ||
.add("metadata", Metadata.FULL_SCHEMA); | ||
|
||
private static final int PROTOCOL_ORDINAL = 0; | ||
private static final int METADATA_ORDINAL = 1; | ||
|
||
private final long version; | ||
private final Metadata metadata; | ||
private final Protocol protocol; | ||
|
||
protected VersionStats(long version, Metadata metadata, Protocol protocol) { | ||
this.version = version; | ||
this.metadata = metadata; | ||
this.protocol = protocol; | ||
} | ||
|
||
/** | ||
* The version of the Delta table that this VersionStats represents. | ||
*/ | ||
public long getVersion() { | ||
return version; | ||
} | ||
|
||
/** | ||
* The {@link Metadata} stored in this VersionStats. May be null. | ||
*/ | ||
public Metadata getMetadata() { | ||
return metadata; | ||
} | ||
|
||
/** | ||
* The {@link Protocol} stored in this VersionStats. May be null. | ||
*/ | ||
public Protocol getProtocol() { | ||
return protocol; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file added
BIN
+1.04 KB
...ta/birthday=2020-01-01/cdc-00000-993231ce-e967-4842-81ec-0bc894621351.c000.snappy.parquet
Binary file not shown.
Binary file added
BIN
+1.04 KB
...ta/birthday=2020-01-01/cdc-00000-cb387749-d66a-4f65-8d62-fc008bfb7b57.c000.snappy.parquet
Binary file not shown.
1 change: 1 addition & 0 deletions
1
...nel-defaults/src/test/resources/stream_table_optimize/_delta_log/00000000000000000000.crc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"tableSizeBytes":0,"numFiles":0,"numMetadata":1,"numProtocol":1,"protocol":{"minReaderVersion":1,"minWriterVersion":4},"metadata":{"id":"332a246e-eea2-43a5-8422-b65526c6cbe9","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"age\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"birthday\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["birthday"],"configuration":{"delta.enableChangeDataFeed":"true"},"createdTime":1664214549691},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"txnId":"eabe3042-b6e7-4710-aa7f-70ab5cfac6cb"} |
3 changes: 3 additions & 0 deletions
3
...el-defaults/src/test/resources/stream_table_optimize/_delta_log/00000000000000000000.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{"commitInfo":{"timestamp":1664214549848,"userId":"7953272455820895","userName":"[email protected]","operation":"CREATE TABLE","operationParameters":{"isManaged":"true","description":null,"partitionBy":"[\"birthday\"]","properties":"{\"delta.enableChangeDataFeed\":\"true\"}"},"notebook":{"notebookId":"3194333061073108"},"clusterId":"0819-204509-hill72","isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{},"engineInfo":"Databricks-Runtime/11.x-snapshot-aarch64-scala2.12","txnId":"eabe3042-b6e7-4710-aa7f-70ab5cfac6cb"}} | ||
{"protocol":{"minReaderVersion":1,"minWriterVersion":4}} | ||
{"metaData":{"id":"332a246e-eea2-43a5-8422-b65526c6cbe9","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"age\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"birthday\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["birthday"],"configuration":{"delta.enableChangeDataFeed":"true"},"createdTime":1664214549691}} |
1 change: 1 addition & 0 deletions
1
...nel-defaults/src/test/resources/stream_table_optimize/_delta_log/00000000000000000001.crc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
{"tableSizeBytes":791,"numFiles":1,"numMetadata":1,"numProtocol":1,"protocol":{"minReaderVersion":1,"minWriterVersion":4},"metadata":{"id":"332a246e-eea2-43a5-8422-b65526c6cbe9","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"name\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"age\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"birthday\",\"type\":\"date\",\"nullable\":true,\"metadata\":{}}]}","partitionColumns":["birthday"],"configuration":{"delta.enableChangeDataFeed":"true"},"createdTime":1664214549691},"histogramOpt":{"sortedBinBoundaries":[0,8192,16384,32768,65536,131072,262144,524288,1048576,2097152,4194304,8388608,12582912,16777216,20971520,25165824,29360128,33554432,37748736,41943040,50331648,58720256,67108864,75497472,83886080,92274688,100663296,109051904,117440512,125829120,130023424,134217728,138412032,142606336,146800640,150994944,167772160,184549376,201326592,218103808,234881024,251658240,268435456,285212672,301989888,318767104,335544320,352321536,369098752,385875968,402653184,419430400,436207616,452984832,469762048,486539264,503316480,520093696,536870912,553648128,570425344,587202560,603979776,671088640,738197504,805306368,872415232,939524096,1006632960,1073741824,1140850688,1207959552,1275068416,1342177280,1409286144,1476395008,1610612736,1744830464,1879048192,2013265920,2147483648,2415919104,2684354560,2952790016,3221225472,3489660928,3758096384,4026531840,4294967296,8589934592,17179869184,34359738368,68719476736,137438953472,274877906944],"fileCounts":[1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],"totalBytes":[791,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]},"txnId":"f7b5a77a-f86d-40b6-b266-5da97aecbe6e"} |
2 changes: 2 additions & 0 deletions
2
...el-defaults/src/test/resources/stream_table_optimize/_delta_log/00000000000000000001.json
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
{"commitInfo":{"timestamp":1664214552033,"userId":"7953272455820895","userName":"[email protected]","operation":"WRITE","operationParameters":{"mode":"Append","partitionBy":"[]"},"notebook":{"notebookId":"3194333061073108"},"clusterId":"0819-204509-hill72","readVersion":0,"isolationLevel":"WriteSerializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"1","numOutputBytes":"791"},"engineInfo":"Databricks-Runtime/11.x-snapshot-aarch64-scala2.12","txnId":"f7b5a77a-f86d-40b6-b266-5da97aecbe6e"}} | ||
{"add":{"path":"birthday=2020-01-01/part-00000-21994048-f0e7-4655-a705-09e597411943.c000.snappy.parquet","partitionValues":{"birthday":"2020-01-01"},"size":791,"modificationTime":1664214552000,"dataChange":true,"stats":"{\"numRecords\":1,\"minValues\":{\"name\":\"1\",\"age\":1},\"maxValues\":{\"name\":\"1\",\"age\":1},\"nullCount\":{\"name\":0,\"age\":0}}","tags":{"INSERTION_TIME":"1664214552000000","MIN_INSERTION_TIME":"1664214552000000","MAX_INSERTION_TIME":"1664214552000000","OPTIMIZE_TARGET_SIZE":"268435456"}}} |
Oops, something went wrong.