-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[low-code CDK] Rsumable full refresh support for low-code streams #38300
Changes from 3 commits
cabc93a
6b021e9
5f597a7
11c0ff8
a26d30b
4071469
8f4a1e7
20d01e8
e19644f
d001858
2ed34fa
1204e46
2d35cf8
3f544cc
f968da7
a2693c1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
# Copyright (c) 2024 Airbyte, Inc., all rights reserved. | ||
|
||
from dataclasses import InitVar, dataclass | ||
from typing import Any, Iterable, Mapping, Optional | ||
|
||
from airbyte_cdk.sources.declarative.incremental import DeclarativeCursor | ||
from airbyte_cdk.sources.declarative.types import Record, StreamSlice, StreamState | ||
|
||
|
||
@dataclass | ||
class ResumableFullRefreshCursor(DeclarativeCursor): | ||
parameters: InitVar[Mapping[str, Any]] | ||
|
||
def __post_init__(self, parameters: Mapping[str, Any]) -> None: | ||
self._cursor: StreamState = {} | ||
|
||
def get_stream_state(self) -> StreamState: | ||
return self._cursor | ||
|
||
def set_initial_state(self, stream_state: StreamState) -> None: | ||
self._cursor = stream_state | ||
|
||
def observe(self, stream_slice: StreamSlice, record: Record) -> None: | ||
""" | ||
Resumable full refresh manages state using a page number so it does not need to update state by observing incoming records. | ||
""" | ||
pass | ||
|
||
def close_slice(self, stream_slice: StreamSlice, *args: Any) -> None: | ||
# The ResumableFullRefreshCursor doesn't support nested streams yet so receiving a partition is unexpected | ||
if stream_slice.partition: | ||
raise ValueError(f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}.") | ||
self._cursor = stream_slice.cursor_slice | ||
|
||
def should_be_synced(self, record: Record) -> bool: | ||
""" | ||
Unlike date-based cursors which filter out records outside slice boundaries, resumable full refresh records exist within pages | ||
that don't have filterable bounds. We should always return them. | ||
""" | ||
return True | ||
|
||
def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is the one field that really doesn't fit the existing cursor interface. We can default to false, and ultimately we don't even care about the record since we close the slice based on the page number. |
||
""" | ||
RFR record don't have ordering to be compared between one another. | ||
""" | ||
return False | ||
|
||
def select_state(self, stream_slice: Optional[StreamSlice] = None) -> Optional[StreamState]: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a new interface method that we need in order to sanely read through state from the CheckpointReader. It isn't that useful for unnested streams, but this is critical for us to be able to sanely parse substream state as we iterate over parent record state in the CheckpointReader when we do the work |
||
# A top-level RFR cursor only manages the state of a single partition | ||
return self._cursor | ||
|
||
def stream_slices(self) -> Iterable[StreamSlice]: | ||
""" | ||
Resumable full refresh cursors only return a single slice and can't perform partitioning because iteration is done per-page | ||
along an unbounded set. | ||
""" | ||
yield from [StreamSlice(cursor_slice=self._cursor, partition={})] | ||
|
||
# This is an interesting pattern that might not seem obvious at first glance. This cursor itself has no functional need to | ||
# inject any request values into the outbound response because the up-to-date pagination state is already loaded and | ||
# maintained by the paginator component | ||
erohmensing marked this conversation as resolved.
Show resolved
Hide resolved
|
||
def get_request_params( | ||
self, | ||
*, | ||
stream_state: Optional[StreamState] = None, | ||
stream_slice: Optional[StreamSlice] = None, | ||
next_page_token: Optional[Mapping[str, Any]] = None, | ||
) -> Mapping[str, Any]: | ||
return {} | ||
|
||
def get_request_headers( | ||
self, | ||
*, | ||
stream_state: Optional[StreamState] = None, | ||
stream_slice: Optional[StreamSlice] = None, | ||
next_page_token: Optional[Mapping[str, Any]] = None, | ||
) -> Mapping[str, Any]: | ||
return {} | ||
|
||
def get_request_body_data( | ||
self, | ||
*, | ||
stream_state: Optional[StreamState] = None, | ||
stream_slice: Optional[StreamSlice] = None, | ||
next_page_token: Optional[Mapping[str, Any]] = None, | ||
) -> Mapping[str, Any]: | ||
return {} | ||
|
||
def get_request_body_json( | ||
self, | ||
*, | ||
stream_state: Optional[StreamState] = None, | ||
stream_slice: Optional[StreamSlice] = None, | ||
next_page_token: Optional[Mapping[str, Any]] = None, | ||
) -> Mapping[str, Any]: | ||
return {} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,7 +28,13 @@ | |
from airbyte_cdk.sources.declarative.decoders import JsonDecoder | ||
from airbyte_cdk.sources.declarative.extractors import DpathExtractor, RecordFilter, RecordSelector | ||
from airbyte_cdk.sources.declarative.extractors.record_selector import SCHEMA_TRANSFORMER_TYPE_MAPPING | ||
from airbyte_cdk.sources.declarative.incremental import CursorFactory, DatetimeBasedCursor, DeclarativeCursor, PerPartitionCursor | ||
from airbyte_cdk.sources.declarative.incremental import ( | ||
CursorFactory, | ||
DatetimeBasedCursor, | ||
DeclarativeCursor, | ||
PerPartitionCursor, | ||
ResumableFullRefreshCursor, | ||
) | ||
from airbyte_cdk.sources.declarative.interpolation import InterpolatedString | ||
from airbyte_cdk.sources.declarative.interpolation.interpolated_mapping import InterpolatedMapping | ||
from airbyte_cdk.sources.declarative.migrations.legacy_to_per_partition_state_migration import LegacyToPerPartitionStateMigration | ||
|
@@ -668,6 +674,10 @@ def _merge_stream_slicers(self, model: DeclarativeStreamModel, config: Config) - | |
) | ||
elif model.incremental_sync: | ||
return self._create_component_from_model(model=model.incremental_sync, config=config) if model.incremental_sync else None | ||
elif hasattr(model.retriever, "paginator") and model.retriever.paginator and not stream_slicer: | ||
# To incrementally deliver RFR for low-code we're first implementing this for streams that do not use | ||
# nested state like substreams or those using list partition routers | ||
return ResumableFullRefreshCursor(parameters={}) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yep I have it filed here when the original spec was written: https://github.com/airbytehq/airbyte-internal-issues/issues/7528 |
||
elif stream_slicer: | ||
return stream_slicer | ||
else: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
more evidence the cursor belongs to the stream, not the retriever