-
Notifications
You must be signed in to change notification settings - Fork 4.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
🐛 low-code: Fix incremental substreams #35471
Changes from 59 commits
10d06bc
7b9af90
82f8915
b7f45eb
0dc78b7
5b90e2b
9f78777
c503691
624eddc
35db4cf
94f2c40
e316f5f
3bdc41e
ccd0645
6dce076
7360b8c
601d15d
c94ef81
b8eb8db
288e56c
c5fac71
af8f439
49b8a26
90254dd
cfb69eb
401518c
3df9648
8eb4276
525843a
53d1b86
0e90bf5
b350af8
78e4364
5ca282a
203819c
2d2156f
529c8b3
401c16d
02b0216
87ba418
6c97f09
da47861
916767a
430f1da
0fc4d85
4542df2
5d5be67
9d29852
f57c722
3864c75
9f1165d
0dd1c66
65a8dc9
72c3104
b625d52
5e4ce20
125424a
2626467
3dd24e3
87ebdbc
2edd16c
2aa68d5
6c3df15
728cd62
f1a415e
dc01934
75a851c
9560422
6dbc44e
c149d6d
64e408d
d3dddfa
c7b70e8
641d735
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,7 @@ | |
|
||
import datetime | ||
from dataclasses import InitVar, dataclass, field | ||
from typing import Any, Iterable, List, Mapping, Optional, Union | ||
from typing import Any, Callable, Iterable, List, Mapping, MutableMapping, Optional, Union | ||
|
||
from airbyte_cdk.models import AirbyteLogMessage, AirbyteMessage, Level, Type | ||
from airbyte_cdk.sources.declarative.datetime.datetime_parser import DatetimeParser | ||
|
@@ -13,7 +13,7 @@ | |
from airbyte_cdk.sources.declarative.interpolation.interpolated_string import InterpolatedString | ||
from airbyte_cdk.sources.declarative.interpolation.jinja import JinjaInterpolation | ||
from airbyte_cdk.sources.declarative.requesters.request_option import RequestOption, RequestOptionType | ||
from airbyte_cdk.sources.declarative.types import Config, Record, StreamSlice, StreamState | ||
from airbyte_cdk.sources.declarative.types import Config, Record, StreamState, StreamSlice | ||
from airbyte_cdk.sources.message import MessageRepository | ||
from isodate import Duration, parse_duration | ||
|
||
|
@@ -70,10 +70,16 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: | |
f"If step is defined, cursor_granularity should be as well and vice-versa. " | ||
f"Right now, step is `{self.step}` and cursor_granularity is `{self.cursor_granularity}`" | ||
) | ||
if not isinstance(self.start_datetime, MinMaxDatetime): | ||
self.start_datetime = MinMaxDatetime(self.start_datetime, parameters) | ||
if self.end_datetime and not isinstance(self.end_datetime, MinMaxDatetime): | ||
self.end_datetime = MinMaxDatetime(self.end_datetime, parameters) | ||
self._start_datetime = ( | ||
MinMaxDatetime(self.start_datetime, parameters) if not isinstance(self.start_datetime, MinMaxDatetime) else self.start_datetime | ||
) | ||
self._end_datetime = ( | ||
None | ||
if not self.end_datetime | ||
else MinMaxDatetime(self.end_datetime, parameters) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Here, nested ifs are harder to read — perhaps if MinMaxtime is always called on that, we cut out one if, and it's easier to read instantly. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. good idea. Added a factory method for |
||
if not isinstance(self.end_datetime, MinMaxDatetime) | ||
else self.end_datetime | ||
) | ||
|
||
self._timezone = datetime.timezone.utc | ||
self._interpolation = JinjaInterpolation() | ||
|
@@ -84,23 +90,23 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None: | |
else datetime.timedelta.max | ||
) | ||
self._cursor_granularity = self._parse_timedelta(self.cursor_granularity) | ||
self.cursor_field = InterpolatedString.create(self.cursor_field, parameters=parameters) | ||
self.lookback_window = InterpolatedString.create(self.lookback_window, parameters=parameters) | ||
self.partition_field_start = InterpolatedString.create(self.partition_field_start or "start_time", parameters=parameters) | ||
self.partition_field_end = InterpolatedString.create(self.partition_field_end or "end_time", parameters=parameters) | ||
self._cursor_field = InterpolatedString.create(self.cursor_field, parameters=parameters) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This seems like a breaking change. I understand the want to make more things private but can we confirm this won't have negative impact at least in our repo. Else, is this necessary for this PR? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The class should only be used by a retriever and the only implementation in our codebase in the SimpleRetriever. Context for the change was satisfying mypy. The alternative if we don't want to start using a private variable is to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for the context! I'm fine with this being breaking and we move one step in the right direction |
||
self._lookback_window = InterpolatedString.create(self.lookback_window, parameters=parameters) if self.lookback_window else None | ||
self._partition_field_start = InterpolatedString.create(self.partition_field_start or "start_time", parameters=parameters) | ||
self._partition_field_end = InterpolatedString.create(self.partition_field_end or "end_time", parameters=parameters) | ||
self._parser = DatetimeParser() | ||
|
||
# If datetime format is not specified then start/end datetime should inherit it from the stream slicer | ||
if not self.start_datetime.datetime_format: | ||
self.start_datetime.datetime_format = self.datetime_format | ||
if self.end_datetime and not self.end_datetime.datetime_format: | ||
self.end_datetime.datetime_format = self.datetime_format | ||
if not self._start_datetime.datetime_format: | ||
self._start_datetime.datetime_format = self.datetime_format | ||
if self._end_datetime and not self._end_datetime.datetime_format: | ||
self._end_datetime.datetime_format = self.datetime_format | ||
|
||
if not self.cursor_datetime_formats: | ||
self.cursor_datetime_formats = [self.datetime_format] | ||
|
||
def get_stream_state(self) -> StreamState: | ||
return {self.cursor_field.eval(self.config): self._cursor} if self._cursor else {} | ||
return {self._cursor_field.eval(self.config): self._cursor} if self._cursor else {} | ||
|
||
def set_initial_state(self, stream_state: StreamState) -> None: | ||
""" | ||
|
@@ -109,17 +115,22 @@ def set_initial_state(self, stream_state: StreamState) -> None: | |
|
||
:param stream_state: The state of the stream as returned by get_stream_state | ||
""" | ||
self._cursor = stream_state.get(self.cursor_field.eval(self.config)) if stream_state else None | ||
self._cursor = stream_state.get(self._cursor_field.eval(self.config)) if stream_state else None | ||
|
||
def close_slice(self, stream_slice: StreamSlice, most_recent_record: Optional[Record]) -> None: | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this interface be changed to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why narrow it? |
||
last_record_cursor_value = most_recent_record.get(self.cursor_field.eval(self.config)) if most_recent_record else None | ||
stream_slice_value_end = stream_slice.get(self.partition_field_end.eval(self.config)) | ||
if stream_slice.partition: | ||
raise ValueError(f"Stream slice {stream_slice} should not have a partition. Got {stream_slice.partition}.") | ||
last_record_cursor_value = most_recent_record.get(self._cursor_field.eval(self.config)) if most_recent_record else None | ||
stream_slice_value_end = stream_slice.get(self._partition_field_end.eval(self.config)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wondering if it would be worth having visibility on the case where the partition part of the Note that it's not only visibility for us but also for devs trying to write custom Python code and that might do mistakes There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
potential_cursor_values = [ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pulled into a variable to filter out potentially There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does that mean the filter on line 132 is not necessary anymore? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. removed the extra filter |
||
cursor_value for cursor_value in [self._cursor, last_record_cursor_value, stream_slice_value_end] if cursor_value | ||
] | ||
cursor_value_str_by_cursor_value_datetime = dict( | ||
map( | ||
# we need to ensure the cursor value is preserved as is in the state else the CATs might complain of something like | ||
# 2023-01-04T17:30:19.000Z' <= '2023-01-04T17:30:19.000000Z' | ||
lambda datetime_str: (self.parse_date(datetime_str), datetime_str), | ||
filter(lambda item: item, [self._cursor, last_record_cursor_value, stream_slice_value_end]), | ||
potential_cursor_values | ||
) | ||
) | ||
self._cursor = ( | ||
|
@@ -142,37 +153,43 @@ def stream_slices(self) -> Iterable[StreamSlice]: | |
return self._partition_daterange(start_datetime, end_datetime, self._step) | ||
|
||
def _calculate_earliest_possible_value(self, end_datetime: datetime.datetime) -> datetime.datetime: | ||
lookback_delta = self._parse_timedelta(self.lookback_window.eval(self.config) if self.lookback_window else "P0D") | ||
earliest_possible_start_datetime = min(self.start_datetime.get_datetime(self.config), end_datetime) | ||
lookback_delta = self._parse_timedelta(self._lookback_window.eval(self.config) if self.lookback_window else "P0D") | ||
earliest_possible_start_datetime = min(self._start_datetime.get_datetime(self.config), end_datetime) | ||
cursor_datetime = self._calculate_cursor_datetime_from_state(self.get_stream_state()) | ||
return max(earliest_possible_start_datetime, cursor_datetime) - lookback_delta | ||
|
||
def _select_best_end_datetime(self) -> datetime.datetime: | ||
now = datetime.datetime.now(tz=self._timezone) | ||
if not self.end_datetime: | ||
if not self._end_datetime: | ||
return now | ||
return min(self.end_datetime.get_datetime(self.config), now) | ||
return min(self._end_datetime.get_datetime(self.config), now) | ||
|
||
def _calculate_cursor_datetime_from_state(self, stream_state: Mapping[str, Any]) -> datetime.datetime: | ||
if self.cursor_field.eval(self.config, stream_state=stream_state) in stream_state: | ||
return self.parse_date(stream_state[self.cursor_field.eval(self.config)]) | ||
if self._cursor_field.eval(self.config, stream_state=stream_state) in stream_state: | ||
return self.parse_date(stream_state[self._cursor_field.eval(self.config)]) | ||
return datetime.datetime.min.replace(tzinfo=datetime.timezone.utc) | ||
|
||
def _format_datetime(self, dt: datetime.datetime) -> str: | ||
return self._parser.format(dt, self.datetime_format) | ||
|
||
def _partition_daterange(self, start: datetime.datetime, end: datetime.datetime, step: Union[datetime.timedelta, Duration]): | ||
start_field = self.partition_field_start.eval(self.config) | ||
end_field = self.partition_field_end.eval(self.config) | ||
def _partition_daterange( | ||
self, start: datetime.datetime, end: datetime.datetime, step: Union[datetime.timedelta, Duration] | ||
) -> List[StreamSlice]: | ||
start_field = self._partition_field_start.eval(self.config) | ||
end_field = self._partition_field_end.eval(self.config) | ||
dates = [] | ||
while start <= end: | ||
next_start = self._evaluate_next_start_date_safely(start, step) | ||
end_date = self._get_date(next_start - self._cursor_granularity, end, min) | ||
dates.append({start_field: self._format_datetime(start), end_field: self._format_datetime(end_date)}) | ||
dates.append( | ||
StreamSlice( | ||
partition={}, cursor_slice={start_field: self._format_datetime(start), end_field: self._format_datetime(end_date)} | ||
) | ||
) | ||
start = next_start | ||
return dates | ||
|
||
def _evaluate_next_start_date_safely(self, start, step): | ||
def _evaluate_next_start_date_safely(self, start: datetime.datetime, step: datetime.timedelta) -> datetime.datetime: | ||
""" | ||
Given that we set the default step at datetime.timedelta.max, we will generate an OverflowError when evaluating the next start_date | ||
This method assumes that users would never enter a step that would generate an overflow. Given that would be the case, the code | ||
|
@@ -183,7 +200,12 @@ def _evaluate_next_start_date_safely(self, start, step): | |
except OverflowError: | ||
return datetime.datetime.max.replace(tzinfo=datetime.timezone.utc) | ||
|
||
def _get_date(self, cursor_value, default_date: datetime.datetime, comparator) -> datetime.datetime: | ||
def _get_date( | ||
self, | ||
cursor_value: datetime.datetime, | ||
default_date: datetime.datetime, | ||
comparator: Callable[[datetime.datetime, datetime.datetime], datetime.datetime], | ||
) -> datetime.datetime: | ||
cursor_date = cursor_value or default_date | ||
return comparator(cursor_date, default_date) | ||
|
||
|
@@ -196,7 +218,7 @@ def parse_date(self, date: str) -> datetime.datetime: | |
raise ValueError(f"No format in {self.cursor_datetime_formats} matching {date}") | ||
|
||
@classmethod | ||
def _parse_timedelta(cls, time_str) -> Union[datetime.timedelta, Duration]: | ||
def _parse_timedelta(cls, time_str: Optional[str]) -> Union[datetime.timedelta, Duration]: | ||
""" | ||
:return Parses an ISO 8601 durations into datetime.timedelta or Duration objects. | ||
""" | ||
|
@@ -244,18 +266,20 @@ def request_kwargs(self) -> Mapping[str, Any]: | |
# Never update kwargs | ||
return {} | ||
|
||
def _get_request_options(self, option_type: RequestOptionType, stream_slice: StreamSlice): | ||
options = {} | ||
def _get_request_options(self, option_type: RequestOptionType, stream_slice: Optional[StreamSlice]) -> Mapping[str, Any]: | ||
options: MutableMapping[str, Any] = {} | ||
if not stream_slice: | ||
return options | ||
if self.start_time_option and self.start_time_option.inject_into == option_type: | ||
options[self.start_time_option.field_name.eval(config=self.config)] = stream_slice.get( | ||
self.partition_field_start.eval(self.config) | ||
options[self.start_time_option.field_name.eval(config=self.config)] = stream_slice.get( # type: ignore # field_name is always casted to an interpolated string | ||
self._partition_field_start.eval(self.config) | ||
) | ||
if self.end_time_option and self.end_time_option.inject_into == option_type: | ||
options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(self.partition_field_end.eval(self.config)) | ||
options[self.end_time_option.field_name.eval(config=self.config)] = stream_slice.get(self._partition_field_end.eval(self.config)) # type: ignore # field_name is always casted to an interpolated string | ||
return options | ||
|
||
def should_be_synced(self, record: Record) -> bool: | ||
cursor_field = self.cursor_field.eval(self.config) | ||
cursor_field = self._cursor_field.eval(self.config) | ||
record_cursor_value = record.get(cursor_field) | ||
if not record_cursor_value: | ||
self._send_log( | ||
|
@@ -278,7 +302,7 @@ def _send_log(self, level: Level, message: str) -> None: | |
) | ||
|
||
def is_greater_than_or_equal(self, first: Record, second: Record) -> bool: | ||
cursor_field = self.cursor_field.eval(self.config) | ||
cursor_field = self._cursor_field.eval(self.config) | ||
first_cursor_value = first.get(cursor_field) | ||
second_cursor_value = second.get(cursor_field) | ||
if first_cursor_value and second_cursor_value: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't know how exactly
MinMaxDatetime
works, but could we make it's constructor always return an object ofMinMaxTime
and acceptMinMaxtime
too so that we can always wrap start_datetime into it, cutting on one if statement?Sure, that's one more function call, but perhaps more readable?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
good idea. Added a factory method for
MinMaxDatetime
and simplified this call