Skip to content

Commit 0e7802a

Browse files
authored
feat(Low-Code Concurrent CDK): Make SimpleRetriever thread-safe so that different partitions can share the same SimpleRetriever (#185)
1 parent 6d5ce67 commit 0e7802a

22 files changed

+635
-435
lines changed

airbyte_cdk/sources/declarative/concurrent_declarative_source.py

Lines changed: 24 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
#
44

55
import logging
6-
from typing import Any, Callable, Generic, Iterator, List, Mapping, Optional, Tuple, Union
6+
from typing import Any, Generic, Iterator, List, Mapping, Optional, Tuple
77

88
from airbyte_cdk.models import (
99
AirbyteCatalog,
@@ -28,15 +28,11 @@
2828
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
2929
DatetimeBasedCursor as DatetimeBasedCursorModel,
3030
)
31-
from airbyte_cdk.sources.declarative.models.declarative_component_schema import (
32-
DeclarativeStream as DeclarativeStreamModel,
33-
)
3431
from airbyte_cdk.sources.declarative.parsers.model_to_component_factory import (
35-
ComponentDefinition,
3632
ModelToComponentFactory,
3733
)
3834
from airbyte_cdk.sources.declarative.requesters import HttpRequester
39-
from airbyte_cdk.sources.declarative.retrievers import Retriever, SimpleRetriever
35+
from airbyte_cdk.sources.declarative.retrievers import SimpleRetriever
4036
from airbyte_cdk.sources.declarative.stream_slicers.declarative_partition_generator import (
4137
DeclarativePartitionFactory,
4238
StreamSlicerPartitionGenerator,
@@ -52,7 +48,6 @@
5248
from airbyte_cdk.sources.streams.concurrent.cursor import FinalStateCursor
5349
from airbyte_cdk.sources.streams.concurrent.default_stream import DefaultStream
5450
from airbyte_cdk.sources.streams.concurrent.helpers import get_primary_key_from_stream
55-
from airbyte_cdk.sources.types import Config, StreamState
5651

5752

5853
class ConcurrentDeclarativeSource(ManifestDeclarativeSource, Generic[TState]):
@@ -194,10 +189,11 @@ def _group_streams(
194189
# Some low-code sources use a combination of DeclarativeStream and regular Python streams. We can't inspect
195190
# these legacy Python streams the way we do low-code streams to determine if they are concurrent compatible,
196191
# so we need to treat them as synchronous
197-
if (
198-
isinstance(declarative_stream, DeclarativeStream)
199-
and name_to_stream_mapping[declarative_stream.name]["retriever"]["type"]
192+
if isinstance(declarative_stream, DeclarativeStream) and (
193+
name_to_stream_mapping[declarative_stream.name]["retriever"]["type"]
200194
== "SimpleRetriever"
195+
or name_to_stream_mapping[declarative_stream.name]["retriever"]["type"]
196+
== "AsyncRetriever"
201197
):
202198
incremental_sync_component_definition = name_to_stream_mapping[
203199
declarative_stream.name
@@ -234,15 +230,27 @@ def _group_streams(
234230
stream_state=stream_state,
235231
)
236232

233+
retriever = declarative_stream.retriever
234+
235+
# This is an optimization so that we don't invoke any cursor or state management flows within the
236+
# low-code framework because state management is handled through the ConcurrentCursor.
237+
if declarative_stream and isinstance(retriever, SimpleRetriever):
238+
# Also a temporary hack. In the legacy Stream implementation, as part of the read,
239+
# set_initial_state() is called to instantiate incoming state on the cursor. Although we no
240+
# longer rely on the legacy low-code cursor for concurrent checkpointing, low-code components
241+
# like StopConditionPaginationStrategyDecorator and ClientSideIncrementalRecordFilterDecorator
242+
# still rely on a DatetimeBasedCursor that is properly initialized with state.
243+
if retriever.cursor:
244+
retriever.cursor.set_initial_state(stream_state=stream_state)
245+
# We zero it out here, but since this is a cursor reference, the state is still properly
246+
# instantiated for the other components that reference it
247+
retriever.cursor = None
248+
237249
partition_generator = StreamSlicerPartitionGenerator(
238250
DeclarativePartitionFactory(
239251
declarative_stream.name,
240252
declarative_stream.get_json_schema(),
241-
self._retriever_factory(
242-
name_to_stream_mapping[declarative_stream.name],
243-
config,
244-
stream_state,
245-
),
253+
retriever,
246254
self.message_repository,
247255
),
248256
cursor,
@@ -272,11 +280,7 @@ def _group_streams(
272280
DeclarativePartitionFactory(
273281
declarative_stream.name,
274282
declarative_stream.get_json_schema(),
275-
self._retriever_factory(
276-
name_to_stream_mapping[declarative_stream.name],
277-
config,
278-
{},
279-
),
283+
declarative_stream.retriever,
280284
self.message_repository,
281285
),
282286
declarative_stream.retriever.stream_slicer,
@@ -415,34 +419,3 @@ def _remove_concurrent_streams_from_catalog(
415419
if stream.stream.name not in concurrent_stream_names
416420
]
417421
)
418-
419-
def _retriever_factory(
420-
self, stream_config: ComponentDefinition, source_config: Config, stream_state: StreamState
421-
) -> Callable[[], Retriever]:
422-
def _factory_method() -> Retriever:
423-
declarative_stream: DeclarativeStream = self._constructor.create_component(
424-
DeclarativeStreamModel,
425-
stream_config,
426-
source_config,
427-
emit_connector_builder_messages=self._emit_connector_builder_messages,
428-
)
429-
430-
# This is an optimization so that we don't invoke any cursor or state management flows within the
431-
# low-code framework because state management is handled through the ConcurrentCursor.
432-
if (
433-
declarative_stream
434-
and declarative_stream.retriever
435-
and isinstance(declarative_stream.retriever, SimpleRetriever)
436-
):
437-
# Also a temporary hack. In the legacy Stream implementation, as part of the read, set_initial_state() is
438-
# called to instantiate incoming state on the cursor. Although we no longer rely on the legacy low-code cursor
439-
# for concurrent checkpointing, low-code components like StopConditionPaginationStrategyDecorator and
440-
# ClientSideIncrementalRecordFilterDecorator still rely on a DatetimeBasedCursor that is properly initialized
441-
# with state.
442-
if declarative_stream.retriever.cursor:
443-
declarative_stream.retriever.cursor.set_initial_state(stream_state=stream_state)
444-
declarative_stream.retriever.cursor = None
445-
446-
return declarative_stream.retriever
447-
448-
return _factory_method

airbyte_cdk/sources/declarative/requesters/paginators/default_paginator.py

Lines changed: 52 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -112,27 +112,39 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None:
112112
)
113113
if isinstance(self.url_base, str):
114114
self.url_base = InterpolatedString(string=self.url_base, parameters=parameters)
115-
self._token: Optional[Any] = self.pagination_strategy.initial_token
115+
116+
def get_initial_token(self) -> Optional[Any]:
117+
"""
118+
Return the page token that should be used for the first request of a stream
119+
120+
WARNING: get_initial_token() should not be used by streams that use RFR that perform checkpointing
121+
of state using page numbers, because paginators are stateless.
122+
"""
123+
return self.pagination_strategy.initial_token
116124

117125
def next_page_token(
118-
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
126+
self,
127+
response: requests.Response,
128+
last_page_size: int,
129+
last_record: Optional[Record],
130+
last_page_token_value: Optional[Any] = None,
119131
) -> Optional[Mapping[str, Any]]:
120-
self._token = self.pagination_strategy.next_page_token(
121-
response, last_page_size, last_record
132+
next_page_token = self.pagination_strategy.next_page_token(
133+
response=response,
134+
last_page_size=last_page_size,
135+
last_record=last_record,
136+
last_page_token_value=last_page_token_value,
122137
)
123-
if self._token:
124-
return {"next_page_token": self._token}
138+
if next_page_token:
139+
return {"next_page_token": next_page_token}
125140
else:
126141
return None
127142

128-
def path(self) -> Optional[str]:
129-
if (
130-
self._token
131-
and self.page_token_option
132-
and isinstance(self.page_token_option, RequestPath)
133-
):
143+
def path(self, next_page_token: Optional[Mapping[str, Any]]) -> Optional[str]:
144+
token = next_page_token.get("next_page_token") if next_page_token else None
145+
if token and self.page_token_option and isinstance(self.page_token_option, RequestPath):
134146
# Replace url base to only return the path
135-
return str(self._token).replace(self.url_base.eval(self.config), "") # type: ignore # url_base is casted to a InterpolatedString in __post_init__
147+
return str(token).replace(self.url_base.eval(self.config), "") # type: ignore # url_base is casted to a InterpolatedString in __post_init__
136148
else:
137149
return None
138150

@@ -143,7 +155,7 @@ def get_request_params(
143155
stream_slice: Optional[StreamSlice] = None,
144156
next_page_token: Optional[Mapping[str, Any]] = None,
145157
) -> MutableMapping[str, Any]:
146-
return self._get_request_options(RequestOptionType.request_parameter)
158+
return self._get_request_options(RequestOptionType.request_parameter, next_page_token)
147159

148160
def get_request_headers(
149161
self,
@@ -152,7 +164,7 @@ def get_request_headers(
152164
stream_slice: Optional[StreamSlice] = None,
153165
next_page_token: Optional[Mapping[str, Any]] = None,
154166
) -> Mapping[str, str]:
155-
return self._get_request_options(RequestOptionType.header)
167+
return self._get_request_options(RequestOptionType.header, next_page_token)
156168

157169
def get_request_body_data(
158170
self,
@@ -161,7 +173,7 @@ def get_request_body_data(
161173
stream_slice: Optional[StreamSlice] = None,
162174
next_page_token: Optional[Mapping[str, Any]] = None,
163175
) -> Mapping[str, Any]:
164-
return self._get_request_options(RequestOptionType.body_data)
176+
return self._get_request_options(RequestOptionType.body_data, next_page_token)
165177

166178
def get_request_body_json(
167179
self,
@@ -170,25 +182,21 @@ def get_request_body_json(
170182
stream_slice: Optional[StreamSlice] = None,
171183
next_page_token: Optional[Mapping[str, Any]] = None,
172184
) -> Mapping[str, Any]:
173-
return self._get_request_options(RequestOptionType.body_json)
174-
175-
def reset(self, reset_value: Optional[Any] = None) -> None:
176-
if reset_value:
177-
self.pagination_strategy.reset(reset_value=reset_value)
178-
else:
179-
self.pagination_strategy.reset()
180-
self._token = self.pagination_strategy.initial_token
185+
return self._get_request_options(RequestOptionType.body_json, next_page_token)
181186

182-
def _get_request_options(self, option_type: RequestOptionType) -> MutableMapping[str, Any]:
187+
def _get_request_options(
188+
self, option_type: RequestOptionType, next_page_token: Optional[Mapping[str, Any]]
189+
) -> MutableMapping[str, Any]:
183190
options = {}
184191

192+
token = next_page_token.get("next_page_token") if next_page_token else None
185193
if (
186194
self.page_token_option
187-
and self._token is not None
195+
and token is not None
188196
and isinstance(self.page_token_option, RequestOption)
189197
and self.page_token_option.inject_into == option_type
190198
):
191-
options[self.page_token_option.field_name.eval(config=self.config)] = self._token # type: ignore # field_name is always cast to an interpolated string
199+
options[self.page_token_option.field_name.eval(config=self.config)] = token # type: ignore # field_name is always cast to an interpolated string
192200
if (
193201
self.page_size_option
194202
and self.pagination_strategy.get_page_size()
@@ -204,6 +212,9 @@ class PaginatorTestReadDecorator(Paginator):
204212
"""
205213
In some cases, we want to limit the number of requests that are made to the backend source. This class allows for limiting the number of
206214
pages that are queried throughout a read command.
215+
216+
WARNING: Unlike the rest of the low-code framework, this decorator is not currently thread-safe because it has
217+
an internal state to track the current number of pages counted so that it can exit early during a test read
207218
"""
208219

209220
_PAGE_COUNT_BEFORE_FIRST_NEXT_CALL = 1
@@ -217,17 +228,27 @@ def __init__(self, decorated: Paginator, maximum_number_of_pages: int = 5) -> No
217228
self._decorated = decorated
218229
self._page_count = self._PAGE_COUNT_BEFORE_FIRST_NEXT_CALL
219230

231+
def get_initial_token(self) -> Optional[Any]:
232+
self._page_count = self._PAGE_COUNT_BEFORE_FIRST_NEXT_CALL
233+
return self._decorated.get_initial_token()
234+
220235
def next_page_token(
221-
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
236+
self,
237+
response: requests.Response,
238+
last_page_size: int,
239+
last_record: Optional[Record],
240+
last_page_token_value: Optional[Any] = None,
222241
) -> Optional[Mapping[str, Any]]:
223242
if self._page_count >= self._maximum_number_of_pages:
224243
return None
225244

226245
self._page_count += 1
227-
return self._decorated.next_page_token(response, last_page_size, last_record)
246+
return self._decorated.next_page_token(
247+
response, last_page_size, last_record, last_page_token_value
248+
)
228249

229-
def path(self) -> Optional[str]:
230-
return self._decorated.path()
250+
def path(self, next_page_token: Optional[Mapping[str, Any]]) -> Optional[str]:
251+
return self._decorated.path(next_page_token)
231252

232253
def get_request_params(
233254
self,
@@ -272,7 +293,3 @@ def get_request_body_json(
272293
return self._decorated.get_request_body_json(
273294
stream_state=stream_state, stream_slice=stream_slice, next_page_token=next_page_token
274295
)
275-
276-
def reset(self, reset_value: Optional[Any] = None) -> None:
277-
self._decorated.reset()
278-
self._page_count = self._PAGE_COUNT_BEFORE_FIRST_NEXT_CALL

airbyte_cdk/sources/declarative/requesters/paginators/no_pagination.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ class NoPagination(Paginator):
1919

2020
parameters: InitVar[Mapping[str, Any]]
2121

22-
def path(self) -> Optional[str]:
22+
def path(self, next_page_token: Optional[Mapping[str, Any]]) -> Optional[str]:
2323
return None
2424

2525
def get_request_params(
@@ -58,11 +58,14 @@ def get_request_body_json(
5858
) -> Mapping[str, Any]:
5959
return {}
6060

61+
def get_initial_token(self) -> Optional[Any]:
62+
return None
63+
6164
def next_page_token(
62-
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
63-
) -> Mapping[str, Any]:
65+
self,
66+
response: requests.Response,
67+
last_page_size: int,
68+
last_record: Optional[Record],
69+
last_page_token_value: Optional[Any],
70+
) -> Optional[Mapping[str, Any]]:
6471
return {}
65-
66-
def reset(self, reset_value: Optional[Any] = None) -> None:
67-
# No state to reset
68-
pass

airbyte_cdk/sources/declarative/requesters/paginators/paginator.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -24,27 +24,32 @@ class Paginator(ABC, RequestOptionsProvider):
2424
"""
2525

2626
@abstractmethod
27-
def reset(self, reset_value: Optional[Any] = None) -> None:
27+
def get_initial_token(self) -> Optional[Any]:
2828
"""
29-
Reset the pagination's inner state
29+
Get the page token that should be included in the request to get the first page of records
3030
"""
3131

3232
@abstractmethod
3333
def next_page_token(
34-
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
34+
self,
35+
response: requests.Response,
36+
last_page_size: int,
37+
last_record: Optional[Record],
38+
last_page_token_value: Optional[Any],
3539
) -> Optional[Mapping[str, Any]]:
3640
"""
3741
Returns the next_page_token to use to fetch the next page of records.
3842
3943
:param response: the response to process
4044
:param last_page_size: the number of records read from the response
4145
:param last_record: the last record extracted from the response
46+
:param last_page_token_value: The page token value that was used for the last request
4247
:return: A mapping {"next_page_token": <token>} for the next page from the input response object. Returning None means there are no more pages to read in this response.
4348
"""
4449
pass
4550

4651
@abstractmethod
47-
def path(self) -> Optional[str]:
52+
def path(self, next_page_token: Optional[Mapping[str, Any]]) -> Optional[str]:
4853
"""
4954
Returns the URL path to hit to fetch the next page of records
5055

airbyte_cdk/sources/declarative/requesters/paginators/strategies/cursor_pagination_strategy.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ class CursorPaginationStrategy(PaginationStrategy):
4343
)
4444

4545
def __post_init__(self, parameters: Mapping[str, Any]) -> None:
46-
self._initial_cursor = None
4746
if isinstance(self.cursor_value, str):
4847
self._cursor_value = InterpolatedString.create(self.cursor_value, parameters=parameters)
4948
else:
@@ -57,10 +56,19 @@ def __post_init__(self, parameters: Mapping[str, Any]) -> None:
5756

5857
@property
5958
def initial_token(self) -> Optional[Any]:
60-
return self._initial_cursor
59+
"""
60+
CursorPaginationStrategy does not have an initial value because the next cursor is typically included
61+
in the response of the first request. For Resumable Full Refresh streams that checkpoint the page
62+
cursor, the next cursor should be read from the state or stream slice object.
63+
"""
64+
return None
6165

6266
def next_page_token(
63-
self, response: requests.Response, last_page_size: int, last_record: Optional[Record]
67+
self,
68+
response: requests.Response,
69+
last_page_size: int,
70+
last_record: Optional[Record],
71+
last_page_token_value: Optional[Any] = None,
6472
) -> Optional[Any]:
6573
decoded_response = next(self.decoder.decode(response))
6674

@@ -87,8 +95,5 @@ def next_page_token(
8795
)
8896
return token if token else None
8997

90-
def reset(self, reset_value: Optional[Any] = None) -> None:
91-
self._initial_cursor = reset_value
92-
9398
def get_page_size(self) -> Optional[int]:
9499
return self.page_size

0 commit comments

Comments
 (0)