Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Catalog: return Iceberg snapshot log based on Nessie commit history #10033

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ as necessary. Empty sections will not end in the release notes.

### Highlights

- Nessie now returns the snapshot history in the `snapshots` and `snapshot-log` attributes of an Iceberg
table-metadata retrieved via Iceberg REST for table changes that have been committed via Iceberg REST
to Nessie 0.101.0 or newer. Commits made using older Nessie versions will not return older snapshots.
- Generally, not only for Nessie, It is recommended to keep the number of snapshots maintained in an
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
- Generally, not only for Nessie, It is recommended to keep the number of snapshots maintained in an
- Generally, not only for Nessie, it is recommended to keep the number of snapshots maintained in an

Iceberg table-metadata as low as possible. Use the maintenance operations provided by Iceberg.

### Upgrade notes

### Breaking changes
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -177,12 +177,62 @@ public static IcebergTableMetadata.Builder tableMetadataSimple() {
.putSummary("operation", "testing")
.sequenceNumber(123L)
.timestampMs(12345678L)
.manifestList("s3://this-does-not-exist/anywhere/")
.build())
.putRef("main", IcebergSnapshotRef.builder().type("branch").snapshotId(11).build())
.addSnapshotLog(
IcebergSnapshotLogEntry.builder().snapshotId(11).timestampMs(12345678L).build());
}

public static IcebergTableMetadata.Builder tableMetadataThreeSnapshots() {
IcebergSchema schemaAllTypes = icebergSchemaAllTypes();

return IcebergTableMetadata.builder()
.tableUuid(UUID.randomUUID().toString())
.lastUpdatedMs(111111111L)
.location("table-location")
.currentSnapshotId(13)
.lastColumnId(schemaAllTypes.fields().get(schemaAllTypes.fields().size() - 1).id())
.lastPartitionId(INITIAL_PARTITION_ID)
.lastSequenceNumber(INITIAL_SEQUENCE_NUMBER)
.currentSchemaId(schemaAllTypes.schemaId())
.defaultSortOrderId(INITIAL_SORT_ORDER_ID)
.defaultSpecId(INITIAL_SPEC_ID)
.putProperty("prop", "value")
.addSchemas(schemaAllTypes)
.addSnapshots(
IcebergSnapshot.builder()
.snapshotId(11)
.schemaId(schemaAllTypes.schemaId())
.putSummary("operation", "testing1")
.sequenceNumber(123L)
.timestampMs(12345676L)
.build())
.addSnapshots(
IcebergSnapshot.builder()
.snapshotId(12)
.schemaId(schemaAllTypes.schemaId())
.putSummary("operation", "testing2")
.sequenceNumber(124L)
.timestampMs(12345677L)
.build())
.addSnapshots(
IcebergSnapshot.builder()
.snapshotId(13)
.schemaId(schemaAllTypes.schemaId())
.putSummary("operation", "testing3")
.sequenceNumber(125L)
.timestampMs(12345678L)
.build())
.putRef("main", IcebergSnapshotRef.builder().type("branch").snapshotId(13).build())
.addSnapshotLog(
IcebergSnapshotLogEntry.builder().snapshotId(11).timestampMs(12345676L).build())
.addSnapshotLog(
IcebergSnapshotLogEntry.builder().snapshotId(12).timestampMs(12345677L).build())
.addSnapshotLog(
IcebergSnapshotLogEntry.builder().snapshotId(13).timestampMs(12345678L).build());
}

public static IcebergViewMetadata.Builder viewMetadataSimple() {
IcebergSchema schemaAllTypes = icebergSchemaAllTypes();

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ public void close() throws IOException {
};
UUID commitId = randomUUID();
long snapshotId = 1;
long sequenceNumber = 1;
long sequenceNumber = 1000;
long timestamp = 1715175169320L;
IcebergManifestFileGenerator manifestFileGenerator =
IcebergManifestFileGenerator.builder()
Expand Down Expand Up @@ -210,8 +210,8 @@ public void close() throws IOException {
.currentSnapshotId(snapshotId)
.defaultSpecId(schemaGenerator.getIcebergPartitionSpec().specId())
.defaultSortOrderId(IcebergSortOrder.UNSORTED_ORDER.orderId())
.lastSequenceNumber(1L)
.snapshots(singletonList(snapshotWithManifestList))
.lastSequenceNumber(1000L)
.addSnapshot(snapshotWithManifestList)
.build();
metadataConsumer.accept(icebergMetadataWithManifestList);
return writer.write(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

import static com.google.common.base.Preconditions.checkState;

import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonView;
Expand All @@ -30,6 +31,8 @@
import javax.annotation.Nullable;
import org.immutables.value.Value;
import org.projectnessie.catalog.formats.iceberg.IcebergSpec;
import org.projectnessie.catalog.model.snapshot.ImmutableImplicitIcebergSnapshot;
import org.projectnessie.catalog.model.snapshot.ImplicitIcebergSnapshot;
import org.projectnessie.nessie.immutables.NessieImmutable;

@NessieImmutable
Expand Down Expand Up @@ -62,6 +65,31 @@ static IcebergSnapshot snapshot(
schemaId);
}

static IcebergSnapshot fromImplicitIcebergSnapshot(
ImplicitIcebergSnapshot implicit, Long parentSnapshotId) {
return ImmutableIcebergSnapshot.of(
implicit.sequenceNumber(),
implicit.snapshotId(),
parentSnapshotId,
implicit.timestampMs(),
implicit.summary(),
implicit.manifests(),
implicit.manifestList(),
implicit.schemaId());
}

@JsonIgnore
default ImplicitIcebergSnapshot asImplicitIcebergSnapshot() {
return ImmutableImplicitIcebergSnapshot.of(
snapshotId(),
sequenceNumber(),
timestampMs(),
summary(),
manifests(),
manifestList(),
schemaId());
}

@Nullable
@jakarta.annotation.Nullable
@JsonInclude(JsonInclude.Include.NON_NULL)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ public enum CatalogOps {
META_SET_SNAPSHOT_REF,
META_REMOVE_SNAPSHOT_REF,
META_UPGRADE_FORMAT_VERSION,
META_REMOVE_SNAPSHOTS,

// Catalog operations
CATALOG_CREATE_ENTITY,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@
import static java.util.Collections.emptyMap;

import java.time.Instant;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.List;
Expand Down Expand Up @@ -56,7 +55,7 @@ public class IcebergTableMetadataUpdateState {
private int lastAddedSchemaId = -1;
private int lastAddedSpecId = -1;
private int lastAddedOrderId = -1;
private final List<IcebergSnapshot> addedSnapshots = new ArrayList<>();
private IcebergSnapshot previouslyAddedSnapshot = null;
private final Set<Integer> addedSchemaIds = new HashSet<>();
private final Set<Integer> addedSpecIds = new HashSet<>();
private final Set<Integer> addedOrderIds = new HashSet<>();
Expand Down Expand Up @@ -86,12 +85,12 @@ public NessieTableSnapshot snapshot() {
return snapshot;
}

public List<IcebergSnapshot> addedSnapshots() {
return addedSnapshots;
public IcebergSnapshot previouslyAddedSnapshot() {
return previouslyAddedSnapshot;
}

public void snapshotAdded(IcebergSnapshot snapshot) {
addedSnapshots.add(snapshot);
previouslyAddedSnapshot = snapshot;
}

public int lastAddedSchemaId() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@
import org.projectnessie.catalog.model.schema.types.NessieTimestampTypeSpec;
import org.projectnessie.catalog.model.schema.types.NessieType;
import org.projectnessie.catalog.model.schema.types.NessieTypeSpec;
import org.projectnessie.catalog.model.snapshot.ImmutableImplicitIcebergSnapshot;
import org.projectnessie.catalog.model.snapshot.ImplicitIcebergSnapshot;
import org.projectnessie.catalog.model.snapshot.NessieEntitySnapshot;
import org.projectnessie.catalog.model.snapshot.NessieTableSnapshot;
import org.projectnessie.catalog.model.snapshot.NessieViewRepresentation;
Expand Down Expand Up @@ -512,7 +514,7 @@ public static NessieTableSnapshot icebergTableSnapshotToNessie(
Function<IcebergSnapshot, String> manifestListLocation) {
NessieTableSnapshot.Builder snapshot = NessieTableSnapshot.builder().id(snapshotId);
if (previous != null) {
snapshot.from(previous);
snapshot.from(previous).implicitIcebergSnapshots(Map.of());

String previousLocation = previous.icebergLocation();
if (previousLocation != null && !previousLocation.equals(iceberg.location())) {
Expand Down Expand Up @@ -623,10 +625,6 @@ public static NessieTableSnapshot icebergTableSnapshotToNessie(
.ifPresent(
currentSnapshot -> {
snapshot.icebergSnapshotId(currentSnapshot.snapshotId());
currentSnapshot
.parentSnapshotId(); // TODO Can we leave this unset, as we do not return previous
// Iceberg
// snapshots??
Integer schemaId = currentSnapshot.schemaId();
if (schemaId != null) {
// TODO this overwrites the "current schema ID" with the schema ID of the current
Expand Down Expand Up @@ -654,10 +652,22 @@ public static NessieTableSnapshot icebergTableSnapshotToNessie(
currentSnapshot.manifests(); // TODO
});

for (IcebergSnapshotLogEntry logEntry : iceberg.snapshotLog()) {
// TODO ??
logEntry.snapshotId();
logEntry.timestampMs();
for (var icebergSnapshotLogEntry : iceberg.snapshotLog()) {
var snapId = icebergSnapshotLogEntry.snapshotId();
if (snapId == iceberg.currentSnapshotId()) {
continue;
}
if (previous == null) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is previous ever non-null? IJ shows all non-test call paths leading to null 🤔

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Those calls come from importIcebergTable... So, essentially if someone commits via Nessie API, each NessieTableSnapshot (after importing) will have duplicate "implicit" snapshot lists. WDYT?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes - unfortunate.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why keep previous if it is always null in production call paths?

var icebergSnap = iceberg.snapshotById(snapId);
if (icebergSnap.isEmpty()) {
throw new IllegalArgumentException(
"Invalid Iceberg table-metadata: snapshot-log references snapshot with ID "
+ snapId
+ ", which is not present as a snapshot object");
}
snapshot.putImplicitIcebergSnapshot(snapId, icebergSnap.get().asImplicitIcebergSnapshot());
}
snapshot.addPreviousIcebergSnapshotId(snapId);
}

for (IcebergStatisticsFile statisticsFile : iceberg.statistics()) {
Expand Down Expand Up @@ -943,6 +953,7 @@ public static IcebergViewMetadata nessieViewSnapshotToIceberg(

public static IcebergTableMetadata nessieTableSnapshotToIceberg(
NessieTableSnapshot nessie,
List<ImplicitIcebergSnapshot> history,
Optional<IcebergSpec> requestedSpecVersion,
Consumer<Map<String, String>> tablePropertiesTweak) {
NessieTable entity = nessie.entity();
Expand Down Expand Up @@ -1031,6 +1042,19 @@ public static IcebergTableMetadata nessieTableSnapshotToIceberg(
? nessie.icebergManifestFileLocations()
: emptyList();

var parentSnapshotId = (Long) null;

for (ImplicitIcebergSnapshot previous : history) {
var snap = IcebergSnapshot.fromImplicitIcebergSnapshot(previous, parentSnapshotId);
metadata.addSnapshot(snap);
parentSnapshotId = snap.snapshotId();
metadata.addSnapshotLog(
IcebergSnapshotLogEntry.snapshotLogEntry(
previous.timestampMs(), previous.snapshotId()));
// TODO we don't include the metadata location yet - we could potentially do that later
// metadata.addMetadataLog(IcebergHistoryEntry.historyEntry(previousSnap.timestampMs(), ));
}

IcebergSnapshot.Builder snapshot =
IcebergSnapshot.builder()
.snapshotId(snapshotId)
Expand All @@ -1039,7 +1063,8 @@ public static IcebergTableMetadata nessieTableSnapshotToIceberg(
.manifestList(manifestListLocation)
.summary(nessie.icebergSnapshotSummary())
.timestampMs(timestampMs)
.sequenceNumber(nessie.icebergSnapshotSequenceNumber());
.sequenceNumber(nessie.icebergSnapshotSequenceNumber())
.parentSnapshotId(parentSnapshotId);
metadata.addSnapshots(snapshot.build());

metadata.putRef(
Expand Down Expand Up @@ -1080,9 +1105,6 @@ public static IcebergTableMetadata nessieTableSnapshotToIceberg(
partitionStatisticsFile.fileSizeInBytes()));
}

// metadata.addMetadataLog();
// metadata.addSnapshotLog();

return metadata.build();
}

Expand Down Expand Up @@ -1577,13 +1599,18 @@ public static void addSnapshot(AddSnapshot u, IcebergTableMetadataUpdateState st
IcebergSnapshot icebergSnapshot = u.snapshot();
Integer schemaId = icebergSnapshot.schemaId();
NessieTableSnapshot snapshot = state.snapshot();
NessieTableSnapshot.Builder snapshotBuilder = state.builder();
if (schemaId != null) {
Optional<NessieSchema> schema = snapshot.schemaByIcebergId(schemaId);
schema.ifPresent(s -> state.builder().currentSchemaId(s.id()));
schema.ifPresent(s -> snapshotBuilder.currentSchemaId(s.id()));
}

state
.builder()
var currentIcebergSnapshotId = snapshot.icebergSnapshotId();
if (currentIcebergSnapshotId != null && currentIcebergSnapshotId != -1L) {
snapshotBuilder.addPreviousIcebergSnapshotId(currentIcebergSnapshotId);
}

snapshotBuilder
.icebergSnapshotId(icebergSnapshot.snapshotId())
.icebergSnapshotSequenceNumber(icebergSnapshot.sequenceNumber())
.icebergLastSequenceNumber(
Expand All @@ -1599,6 +1626,23 @@ public static void addSnapshot(AddSnapshot u, IcebergTableMetadataUpdateState st
.icebergManifestListLocation(icebergSnapshot.manifestList())
.icebergManifestFileLocations(icebergSnapshot.manifests());

// If another Iceberg snapshot was add in this same update-transaction, memoize the previous
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: add -> added

// snapshot so we can return the full snapshot history.
var previouslyAddedSnapshot = state.previouslyAddedSnapshot();
if (previouslyAddedSnapshot != null) {
snapshotBuilder.putImplicitIcebergSnapshot(
previouslyAddedSnapshot.snapshotId(),
ImmutableImplicitIcebergSnapshot.builder()
.snapshotId(previouslyAddedSnapshot.snapshotId())
.schemaId(previouslyAddedSnapshot.schemaId())
.manifests(previouslyAddedSnapshot.manifests())
.manifestList(previouslyAddedSnapshot.manifestList())
.sequenceNumber(previouslyAddedSnapshot.sequenceNumber())
.summary(previouslyAddedSnapshot.summary())
.timestampMs(previouslyAddedSnapshot.timestampMs())
.build());
}

state.snapshotAdded(icebergSnapshot);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
*/
package org.projectnessie.catalog.formats.iceberg.rest;

import static com.google.common.base.Preconditions.checkArgument;
import static com.google.common.base.Preconditions.checkState;
import static java.util.Collections.emptyList;
import static java.util.Collections.singleton;
Expand All @@ -37,6 +38,7 @@
import java.util.Map;
import java.util.Objects;
import java.util.Set;
import java.util.stream.Collectors;
import javax.annotation.Nullable;
import org.immutables.value.Value;
import org.projectnessie.catalog.formats.iceberg.meta.IcebergPartitionSpec;
Expand Down Expand Up @@ -156,8 +158,14 @@ interface RemoveSnapshots extends IcebergMetadataUpdate {

@Override
default void applyToTable(IcebergTableMetadataUpdateState state) {
throw new UnsupportedOperationException(
"Nessie Catalog does not allow external snapshot management");
state.addCatalogOp(CatalogOps.META_REMOVE_SNAPSHOTS);
var ids = new HashSet<>(snapshotIds());
state
.builder()
.previousIcebergSnapshotIds(
state.snapshot().previousIcebergSnapshotIds().stream()
.filter(id -> !ids.contains(id))
.collect(Collectors.toList()));
}
}

Expand Down Expand Up @@ -476,6 +484,19 @@ default void applyToTable(IcebergTableMetadataUpdateState state) {
state.addCatalogOp(CatalogOps.META_ADD_SNAPSHOT);
Map<String, String> summary = snapshot().summary();

Long currentSnapshotId = state.snapshot().icebergSnapshotId();
Long parentSnapshotId = snapshot().parentSnapshotId();
checkArgument(
parentSnapshotId == null || Objects.equals(currentSnapshotId, parentSnapshotId),
"Snapshot to be added must use the parent snapshot ID %s or null, but provided parent snapshot ID is %s",
currentSnapshotId,
parentSnapshotId);
checkArgument(
(currentSnapshotId == null || snapshot().snapshotId() != currentSnapshotId)
&& !state.snapshot().previousIcebergSnapshotIds().contains(snapshot().snapshotId()),
"Provided snapshot ID %s is already used",
snapshot().snapshotId());

String v = summary.get("added-data-files");
if (v != null && Long.parseLong(v) > 0) {
state.addCatalogOp(CatalogOps.SNAP_ADD_DATA_FILES);
Expand Down
Loading