From 45d70eacf0912699759e07eb5f54a83df2fb1e95 Mon Sep 17 00:00:00 2001 From: AstrakhantsevaAA Date: Mon, 11 Sep 2023 18:13:36 +0200 Subject: [PATCH 01/10] [docs] added api reference, test run --- .../docs/api_reference/dlt/sources.html | 241 ++++ .../dlt/sources/credentials.html | 241 ++++ .../api_reference/dlt/sources/helpers.html | 238 ++++ .../dlt/sources/helpers/requests.html | 315 +++++ .../dlt/sources/helpers/requests/retry.html | 1139 +++++++++++++++++ .../dlt/sources/helpers/requests/session.html | 534 ++++++++ .../dlt/sources/helpers/requests/typing.html | 263 ++++ .../dlt/sources/helpers/transform.html | 323 +++++ docs/website/docs/api_reference/index.html | 7 + docs/website/docs/api_reference/search.js | 46 + docs/website/sidebars.js | 7 + 11 files changed, 3354 insertions(+) create mode 100644 docs/website/docs/api_reference/dlt/sources.html create mode 100644 docs/website/docs/api_reference/dlt/sources/credentials.html create mode 100644 docs/website/docs/api_reference/dlt/sources/helpers.html create mode 100644 docs/website/docs/api_reference/dlt/sources/helpers/requests.html create mode 100644 docs/website/docs/api_reference/dlt/sources/helpers/requests/retry.html create mode 100644 docs/website/docs/api_reference/dlt/sources/helpers/requests/session.html create mode 100644 docs/website/docs/api_reference/dlt/sources/helpers/requests/typing.html create mode 100644 docs/website/docs/api_reference/dlt/sources/helpers/transform.html create mode 100644 docs/website/docs/api_reference/index.html create mode 100644 docs/website/docs/api_reference/search.js diff --git a/docs/website/docs/api_reference/dlt/sources.html b/docs/website/docs/api_reference/dlt/sources.html new file mode 100644 index 0000000000..45d8b6f5cc --- /dev/null +++ b/docs/website/docs/api_reference/dlt/sources.html @@ -0,0 +1,241 @@ + + + + + + + dlt.sources API documentation + + + + + + + + + +
+
+

+dlt.sources

+ +

Module with built-in sources and source building blocks

+
+ + + + + +
1"""Module with built in sources and source building blocks"""
+2from dlt.extract.incremental import Incremental as incremental
+
+ + +
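The module body above re-exports `Incremental` as `incremental`, which is typically declared as a default argument of a resource function so that dlt tracks the cursor between runs. A minimal sketch, assuming a placeholder endpoint and cursor field:

```python
import dlt
from dlt.sources import incremental
from dlt.sources.helpers import requests


@dlt.resource(primary_key="id")
def events(updated_at=incremental("updated_at", initial_value="2023-01-01T00:00:00Z")):
    # `last_value` holds the highest cursor value seen in previous runs,
    # so only newer records are requested (URL and params are placeholders)
    response = requests.get(
        "https://api.example.com/events",
        params={"since": updated_at.last_value},
    )
    yield response.json()
```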
+
+ + \ No newline at end of file diff --git a/docs/website/docs/api_reference/dlt/sources/credentials.html b/docs/website/docs/api_reference/dlt/sources/credentials.html new file mode 100644 index 0000000000..67f2d9f0b5 --- /dev/null +++ b/docs/website/docs/api_reference/dlt/sources/credentials.html @@ -0,0 +1,241 @@ + + + + + + + dlt.sources.credentials API documentation + + + + + + + + + +
+
+

+dlt.sources.credentials

+ + + + + + +
1from dlt.common.configuration.specs import GcpServiceAccountCredentials, GcpOAuthCredentials, GcpCredentials
+2from dlt.common.configuration.specs import ConnectionStringCredentials
+3from dlt.common.configuration.specs import OAuth2Credentials
+4from dlt.common.configuration.specs import CredentialsConfiguration, configspec
+
+ + +
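The re-exported specs can also be used on their own; for example, `ConnectionStringCredentials` parses a SQLAlchemy-style URL into its parts. A minimal sketch, assuming the spec may be instantiated empty and fed a native value (the URL is a placeholder):

```python
from dlt.sources.credentials import ConnectionStringCredentials

creds = ConnectionStringCredentials()
# Split the URL into drivername, username, password, host, port and database
creds.parse_native_representation("postgresql://loader:secret@localhost:5432/dlt_data")
print(creds.drivername, creds.host, creds.database)
```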
+
+ + \ No newline at end of file diff --git a/docs/website/docs/api_reference/dlt/sources/helpers.html b/docs/website/docs/api_reference/dlt/sources/helpers.html new file mode 100644 index 0000000000..2e51f30d90 --- /dev/null +++ b/docs/website/docs/api_reference/dlt/sources/helpers.html @@ -0,0 +1,238 @@ + + + + + + + dlt.sources.helpers API documentation + + + + + + + + + +
+
+

+dlt.sources.helpers

+ + + + + +
+
+ + \ No newline at end of file diff --git a/docs/website/docs/api_reference/dlt/sources/helpers/requests.html b/docs/website/docs/api_reference/dlt/sources/helpers/requests.html new file mode 100644 index 0000000000..0a0dcb7381 --- /dev/null +++ b/docs/website/docs/api_reference/dlt/sources/helpers/requests.html @@ -0,0 +1,315 @@ + + + + + + + dlt.sources.helpers.requests API documentation + + + + + + + + + +
+
+

+dlt.sources.helpers.requests

+ + + + + + +
 1from tenacity import RetryError
+ 2from requests import (
+ 3    Request, Response,
+ 4    ConnectionError,
+ 5    ConnectTimeout,
+ 6    FileModeWarning,
+ 7    HTTPError,
+ 8    ReadTimeout,
+ 9    RequestException,
+10    Timeout,
+11    TooManyRedirects,
+12    URLRequired,
+13)
+14from requests.exceptions import ChunkedEncodingError
+15from dlt.sources.helpers.requests.retry import Client
+16from dlt.sources.helpers.requests.session import Session
+17from dlt.common.configuration.specs import RunConfiguration
+18
+19client = Client()
+20
+21get, post, put, patch, delete, options, head, request = (
+22    client.get, client.post, client.put, client.patch, client.delete, client.options, client.head, client.request
+23)
+24
+25
+26def init(config: RunConfiguration) -> None:
+27    """Initialize the default requests client from config"""
+28    client.update_from_config(config)
+
+ + +
+
+
+ client = +<dlt.sources.helpers.requests.retry.Client object> + + +
+ + + + +
+
+ +
+ + def + init( config: dlt.common.configuration.specs.run_configuration.RunConfiguration) -> None: + + + +
+ +
27def init(config: RunConfiguration) -> None:
+28    """Initialize the default requests client from config"""
+29    client.update_from_config(config)
+
+ + +

Initialize the default requests client from config

+
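In practice the module is used as a drop-in replacement for the top-level `requests` API; each call below is routed through the shared `client`, so it gets the default timeout, the retry policy and `raise_for_status()` handling. A minimal sketch with a placeholder URL:

```python
from dlt.sources.helpers import requests

# Same signature as `requests.get`, but retried on 429/5xx responses
# and on connection errors, with the configured timeout applied
response = requests.get("https://api.example.com/items")
print(response.json())
```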
+ + +
+
+ + \ No newline at end of file diff --git a/docs/website/docs/api_reference/dlt/sources/helpers/requests/retry.html b/docs/website/docs/api_reference/dlt/sources/helpers/requests/retry.html new file mode 100644 index 0000000000..0c9c6ca9fe --- /dev/null +++ b/docs/website/docs/api_reference/dlt/sources/helpers/requests/retry.html @@ -0,0 +1,1139 @@ + + + + + + + dlt.sources.helpers.requests.retry API documentation + + + + + + + + + +
+
+

+dlt.sources.helpers.requests.retry

+ + + + + + +
  1from email.utils import parsedate_tz, mktime_tz
+  2import re
+  3import time
+  4from typing import Optional, cast, Callable, Type, Union, Sequence, Tuple, List, TYPE_CHECKING, Any, Dict
+  5from threading import local
+  6
+  7from requests import Response, HTTPError, Session as BaseSession
+  8from requests.exceptions import ConnectionError, Timeout, ChunkedEncodingError
+  9from requests.adapters import HTTPAdapter
+ 10from tenacity import Retrying, retry_if_exception_type, stop_after_attempt, RetryCallState, retry_any, wait_exponential
+ 11from tenacity.retry import retry_base
+ 12
+ 13from dlt.sources.helpers.requests.session import Session, DEFAULT_TIMEOUT
+ 14from dlt.sources.helpers.requests.typing import TRequestTimeout
+ 15from dlt.common.typing import TimedeltaSeconds
+ 16from dlt.common.configuration.specs import RunConfiguration
+ 17from dlt.common.configuration import with_config
+ 18
+ 19
+ 20DEFAULT_RETRY_STATUS = (429, *range(500, 600))
+ 21DEFAULT_RETRY_EXCEPTIONS = (ConnectionError, Timeout, ChunkedEncodingError)
+ 22
+ 23RetryPredicate = Callable[[Optional[Response], Optional[BaseException]], bool]
+ 24
+ 25
+ 26def _get_retry_response(retry_state: RetryCallState) -> Optional[Response]:
+ 27    ex = retry_state.outcome.exception()
+ 28    if ex:
+ 29        if isinstance(ex, HTTPError):
+ 30            return cast(Response, ex.response)
+ 31        return None
+ 32    result = retry_state.outcome.result()
+ 33    return result if isinstance(result, Response) else None
+ 34
+ 35
+ 36class retry_if_status(retry_base):
+ 37    """Retry for given response status codes"""
+ 38
+ 39    def __init__(self, status_codes: Sequence[int]) -> None:
+ 40        self.status_codes = set(status_codes)
+ 41
+ 42    def __call__(self, retry_state: RetryCallState) -> bool:
+ 43        response = _get_retry_response(retry_state)
+ 44        if response is None:
+ 45            return False
+ 46        result = response.status_code in self.status_codes
+ 47        return result
+ 48
+ 49
+ 50class retry_if_predicate(retry_base):
+ 51    def __init__(self, predicate: RetryPredicate) -> None:
+ 52        self.predicate = predicate
+ 53
+ 54    def __call__(self, retry_state: RetryCallState) -> bool:
+ 55        response = _get_retry_response(retry_state)
+ 56        exception = retry_state.outcome.exception()
+ 57        return self.predicate(response, exception)
+ 58
+ 59
+ 60class wait_exponential_retry_after(wait_exponential):
+ 61    def _parse_retry_after(self, retry_after: str) -> Optional[float]:
+ 62        # Borrowed from urllib3
+ 63        seconds: float
+ 64        # Whitespace: https://tools.ietf.org/html/rfc7230#section-3.2.4
+ 65        if re.match(r"^\s*[0-9]+\s*$", retry_after):
+ 66            seconds = int(retry_after)
+ 67        else:
+ 68            retry_date_tuple = parsedate_tz(retry_after)
+ 69            if retry_date_tuple is None:
+ 70                return None
+ 71            retry_date = mktime_tz(retry_date_tuple)
+ 72            seconds = retry_date - time.time()
+ 73        return max(self.min, min(self.max, seconds))
+ 74
+ 75    def _get_retry_after(self, retry_state: RetryCallState) -> Optional[float]:
+ 76        response = _get_retry_response(retry_state)
+ 77        if response is None:
+ 78            return None
+ 79        header = response.headers.get("Retry-After")
+ 80        if not header:
+ 81            return None
+ 82        return self._parse_retry_after(header)
+ 83
+ 84    def __call__(self, retry_state: RetryCallState) -> float:
+ 85        retry_after = self._get_retry_after(retry_state)
+ 86        if retry_after is not None:
+ 87            return retry_after
+ 88        return super().__call__(retry_state)
+ 89
+ 90
+ 91def _make_retry(
+ 92    status_codes: Sequence[int],
+ 93    exceptions: Sequence[Type[Exception]],
+ 94    max_attempts: int,
+ 95    condition: Union[RetryPredicate, Sequence[RetryPredicate], None],
+ 96    backoff_factor: float,
+ 97    respect_retry_after_header: bool,
+ 98    max_delay: TimedeltaSeconds,
+ 99)-> Retrying:
+100    retry_conds = [retry_if_status(status_codes), retry_if_exception_type(tuple(exceptions))]
+101    if condition is not None:
+102        if callable(condition):
+103            condition = [condition]
+104        retry_conds.extend([retry_if_predicate(c) for c in condition])
+105
+106    wait_cls = wait_exponential_retry_after if respect_retry_after_header else wait_exponential
+107    return Retrying(
+108        wait=wait_cls(multiplier=backoff_factor, max=max_delay),
+109        retry=(retry_any(*retry_conds)),
+110        stop=stop_after_attempt(max_attempts),
+111        reraise=True,
+112        retry_error_callback=lambda state: state.outcome.result(),
+113    )
+114
+115
+116class Client:
+117    """Wrapper for `requests` to create a `Session` with configurable retry functionality.
+118
+119    ### Summary
+120    Create a `requests.Session` which automatically retries requests in case of error.
+121    By default retries are triggered for `5xx` and `429` status codes and when the server is unreachable or drops the connection.
+122
+123    ### Custom retry condition
+124    You can provide one or more custom predicates for specific retry conditions. The predicate is called after every request with the resulting response and/or exception.
+125    For example, this will trigger a retry when the response text is `error`:
+126
+127    >>> from typing import Optional
+128    >>> from requests import Response
+129    >>>
+130    >>> def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool:
+131    >>>     if response is None:
+132    >>>         return False
+133    >>>     return response.text == 'error'
+134
+135    The retry is triggered when any of the predicates or the default conditions based on status code/exception are `True`.
+136
+137    ### Args:
+138        request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
+139        max_connections: Max connections per host in the HTTPAdapter pool
+140        raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
+141        session: Optional `requests.Session` instance to add the retry handler to. A new session is created by default.
+142        status_codes: Retry when response has any of these status codes. Default `429` and all `5xx` codes. Pass an empty list to disable retry based on status.
+143        exceptions: Retry on exception of given type(s). Default `(requests.ConnectionError, requests.Timeout, requests.ChunkedEncodingError)`. Pass an empty list to disable retry on exceptions.
+144        request_max_attempts: Max number of retry attempts before giving up
+145        retry_condition: A predicate or a list of predicates to decide whether to retry. If any predicate returns `True` the request is retried
+146        request_backoff_factor: Multiplier used for exponential delay between retries
+147        request_max_retry_delay: Maximum delay when using exponential backoff
+148        respect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay
+149        session_attrs: Extra attributes that will be set on the session instance, e.g. `{'headers': {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes)
+150    """
+151    _session_attrs: Dict[str, Any]
+152
+153    @with_config(spec=RunConfiguration)
+154    def __init__(
+155        self,
+156        request_timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
+157        max_connections: int = 50,
+158        raise_for_status: bool = True,
+159        status_codes: Sequence[int] = DEFAULT_RETRY_STATUS,
+160        exceptions: Sequence[Type[Exception]] = DEFAULT_RETRY_EXCEPTIONS,
+161        request_max_attempts: int = RunConfiguration.request_max_attempts,
+162        retry_condition: Union[RetryPredicate, Sequence[RetryPredicate], None] = None,
+163        request_backoff_factor: float = RunConfiguration.request_backoff_factor,
+164        request_max_retry_delay: TimedeltaSeconds = RunConfiguration.request_max_retry_delay,
+165        respect_retry_after_header: bool = True,
+166        session_attrs: Optional[Dict[str, Any]] = None,
+167    ) -> None:
+168        self._adapter = HTTPAdapter(pool_maxsize=max_connections)
+169        self._local = local()
+170        self._session_kwargs = dict(timeout=request_timeout, raise_for_status=raise_for_status)
+171        self._retry_kwargs: Dict[str, Any] = dict(
+172            status_codes=status_codes,
+173            exceptions=exceptions,
+174            max_attempts=request_max_attempts,
+175            condition=retry_condition,
+176            backoff_factor=request_backoff_factor,
+177            respect_retry_after_header=respect_retry_after_header,
+178            max_delay=request_max_retry_delay
+179        )
+180        self._session_attrs = session_attrs or {}
+181
+182        if TYPE_CHECKING:
+183            self.get = self.session.get
+184            self.post = self.session.post
+185            self.put = self.session.put
+186            self.patch = self.session.patch
+187            self.delete = self.session.delete
+188            self.head = self.session.head
+189            self.options = self.session.options
+190            self.request = self.session.request
+191
+192        self.get = lambda *a, **kw: self.session.get(*a, **kw)
+193        self.post = lambda *a, **kw: self.session.post(*a, **kw)
+194        self.put = lambda *a, **kw: self.session.put(*a, **kw)
+195        self.patch = lambda *a, **kw: self.session.patch(*a, **kw)
+196        self.delete = lambda *a, **kw: self.session.delete(*a, **kw)
+197        self.head = lambda *a, **kw: self.session.head(*a, **kw)
+198        self.options = lambda *a, **kw: self.session.options(*a, **kw)
+199        self.request = lambda *a, **kw: self.session.request(*a, **kw)
+200
+201        self._config_version: int = 0  # Incrementing marker to ensure per-thread sessions are recreated on config changes
+202
+203    def update_from_config(self, config: RunConfiguration) -> None:
+204        """Update session/retry settings from RunConfiguration"""
+205        self._session_kwargs['timeout'] = config.request_timeout
+206        self._retry_kwargs['backoff_factor'] = config.request_backoff_factor
+207        self._retry_kwargs['max_delay'] = config.request_max_retry_delay
+208        self._retry_kwargs['max_attempts'] = config.request_max_attempts
+209        self._config_version += 1
+210
+211    def _make_session(self) -> Session:
+212        session = Session(**self._session_kwargs)  # type: ignore[arg-type]
+213        for key, value in self._session_attrs.items():
+214            setattr(session, key, value)
+215        session.mount('http://', self._adapter)
+216        session.mount('https://', self._adapter)
+217        retry = _make_retry(**self._retry_kwargs)
+218        session.request = retry.wraps(session.request)  # type: ignore[method-assign]
+219        return session
+220
+221    @property
+222    def session(self) -> Session:
+223        session: Optional[Session] = getattr(self._local, 'session', None)
+224        version = self._config_version
+225        if session is not None:
+226            version = self._local.config_version
+227        if session is None or version != self._config_version:
+228            # Create a new session if config has changed
+229            session = self._local.session = self._make_session()
+230            self._local.config_version = self._config_version
+231        return session
+
+ + +
+
+
+ DEFAULT_RETRY_STATUS = + + (429, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599) + + +
+ + + + +
+
+
+ DEFAULT_RETRY_EXCEPTIONS = + + (<class 'requests.exceptions.ConnectionError'>, <class 'requests.exceptions.Timeout'>, <class 'requests.exceptions.ChunkedEncodingError'>) + + +
+ + + + +
+
+
+ RetryPredicate = +typing.Callable[[typing.Optional[requests.models.Response], typing.Optional[BaseException]], bool] + + +
+ + + + +
+
+ +
+ + class + retry_if_status(tenacity.retry.retry_base): + + + +
+ +
37class retry_if_status(retry_base):
+38    """Retry for given response status codes"""
+39
+40    def __init__(self, status_codes: Sequence[int]) -> None:
+41        self.status_codes = set(status_codes)
+42
+43    def __call__(self, retry_state: RetryCallState) -> bool:
+44        response = _get_retry_response(retry_state)
+45        if response is None:
+46            return False
+47        result = response.status_code in self.status_codes
+48        return result
+
+ + +

Retry for given response status codes

+
+ + +
+ +
+ + retry_if_status(status_codes: Sequence[int]) + + + +
+ +
40    def __init__(self, status_codes: Sequence[int]) -> None:
+41        self.status_codes = set(status_codes)
+
+ + + + +
+
+
+ status_codes + + +
+ + + + +
+
+
+ +
+ + class + retry_if_predicate(tenacity.retry.retry_base): + + + +
+ +
51class retry_if_predicate(retry_base):
+52    def __init__(self, predicate: RetryPredicate) -> None:
+53        self.predicate = predicate
+54
+55    def __call__(self, retry_state: RetryCallState) -> bool:
+56        response = _get_retry_response(retry_state)
+57        exception = retry_state.outcome.exception()
+58        return self.predicate(response, exception)
+
+ + +

Abstract base class for retry strategies.

+
+ + +
+ +
+ + retry_if_predicate( predicate: Callable[[Optional[requests.models.Response], Optional[BaseException]], bool]) + + + +
+ +
52    def __init__(self, predicate: RetryPredicate) -> None:
+53        self.predicate = predicate
+
+ + + + +
+
+
+ predicate + + +
+ + + + +
+
+
+ +
+ + class + wait_exponential_retry_after(tenacity.wait.wait_exponential): + + + +
+ +
61class wait_exponential_retry_after(wait_exponential):
+62    def _parse_retry_after(self, retry_after: str) -> Optional[float]:
+63        # Borrowed from urllib3
+64        seconds: float
+65        # Whitespace: https://tools.ietf.org/html/rfc7230#section-3.2.4
+66        if re.match(r"^\s*[0-9]+\s*$", retry_after):
+67            seconds = int(retry_after)
+68        else:
+69            retry_date_tuple = parsedate_tz(retry_after)
+70            if retry_date_tuple is None:
+71                return None
+72            retry_date = mktime_tz(retry_date_tuple)
+73            seconds = retry_date - time.time()
+74        return max(self.min, min(self.max, seconds))
+75
+76    def _get_retry_after(self, retry_state: RetryCallState) -> Optional[float]:
+77        response = _get_retry_response(retry_state)
+78        if response is None:
+79            return None
+80        header = response.headers.get("Retry-After")
+81        if not header:
+82            return None
+83        return self._parse_retry_after(header)
+84
+85    def __call__(self, retry_state: RetryCallState) -> float:
+86        retry_after = self._get_retry_after(retry_state)
+87        if retry_after is not None:
+88            return retry_after
+89        return super().__call__(retry_state)
+
+ + +

Wait strategy that applies exponential backoff.

+ +

It allows for a customized multiplier and an ability to restrict the +upper and lower limits to some maximum and minimum value.

+ +

The intervals are fixed (i.e. there is no jitter), so this strategy is +suitable for balancing retries against latency when a required resource is +unavailable for an unknown duration, but not suitable for resolving +contention between multiple processes for a shared resource. Use +wait_random_exponential for the latter case.

+
+ + +
+
Inherited Members
+
+
tenacity.wait.wait_exponential
+
wait_exponential
+
multiplier
+
min
+
max
+
exp_base
+ +
+
+
+
+
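The subclass above layers `Retry-After` handling on top of the inherited backoff: a parseable header (delta-seconds or HTTP-date) wins, clamped to `[min, max]`; otherwise the plain exponential wait applies. A sketch of standalone use with `tenacity`; the retried function is a placeholder:

```python
from requests import Timeout
from tenacity import Retrying, retry_if_exception_type, stop_after_attempt

from dlt.sources.helpers.requests.retry import wait_exponential_retry_after


def flaky_call() -> str:
    # Placeholder for an operation that may raise requests.Timeout
    return "ok"


retrying = Retrying(
    # Wait the server-requested interval when a Retry-After header is present,
    # otherwise multiplier * 2**attempt seconds, capped at 300 seconds
    wait=wait_exponential_retry_after(multiplier=1, max=300),
    retry=retry_if_exception_type(Timeout),
    stop=stop_after_attempt(5),
    reraise=True,
)
result = retrying(flaky_call)
```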
+ +
+ + class + Client: + + + +
+ +
117class Client:
+118    """Wrapper for `requests` to create a `Session` with configurable retry functionality.
+119
+120    ### Summary
+121    Create a `requests.Session` which automatically retries requests in case of error.
+122    By default retries are triggered for `5xx` and `429` status codes and when the server is unreachable or drops the connection.
+123
+124    ### Custom retry condition
+125    You can provide one or more custom predicates for specific retry conditions. The predicate is called after every request with the resulting response and/or exception.
+126    For example, this will trigger a retry when the response text is `error`:
+127
+128    >>> from typing import Optional
+129    >>> from requests import Response
+130    >>>
+131    >>> def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool:
+132    >>>     if response is None:
+133    >>>         return False
+134    >>>     return response.text == 'error'
+135
+136    The retry is triggered when any of the predicates or the default conditions based on status code/exception are `True`.
+137
+138    ### Args:
+139        request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
+140        max_connections: Max connections per host in the HTTPAdapter pool
+141        raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
+142        session: Optional `requests.Session` instance to add the retry handler to. A new session is created by default.
+143        status_codes: Retry when response has any of these status codes. Default `429` and all `5xx` codes. Pass an empty list to disable retry based on status.
+144        exceptions: Retry on exception of given type(s). Default `(requests.ConnectionError, requests.Timeout, requests.ChunkedEncodingError)`. Pass an empty list to disable retry on exceptions.
+145        request_max_attempts: Max number of retry attempts before giving up
+146        retry_condition: A predicate or a list of predicates to decide whether to retry. If any predicate returns `True` the request is retried
+147        request_backoff_factor: Multiplier used for exponential delay between retries
+148        request_max_retry_delay: Maximum delay when using exponential backoff
+149        respect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay
+150        session_attrs: Extra attributes that will be set on the session instance, e.g. `{'headers': {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes)
+151    """
+152    _session_attrs: Dict[str, Any]
+153
+154    @with_config(spec=RunConfiguration)
+155    def __init__(
+156        self,
+157        request_timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
+158        max_connections: int = 50,
+159        raise_for_status: bool = True,
+160        status_codes: Sequence[int] = DEFAULT_RETRY_STATUS,
+161        exceptions: Sequence[Type[Exception]] = DEFAULT_RETRY_EXCEPTIONS,
+162        request_max_attempts: int = RunConfiguration.request_max_attempts,
+163        retry_condition: Union[RetryPredicate, Sequence[RetryPredicate], None] = None,
+164        request_backoff_factor: float = RunConfiguration.request_backoff_factor,
+165        request_max_retry_delay: TimedeltaSeconds = RunConfiguration.request_max_retry_delay,
+166        respect_retry_after_header: bool = True,
+167        session_attrs: Optional[Dict[str, Any]] = None,
+168    ) -> None:
+169        self._adapter = HTTPAdapter(pool_maxsize=max_connections)
+170        self._local = local()
+171        self._session_kwargs = dict(timeout=request_timeout, raise_for_status=raise_for_status)
+172        self._retry_kwargs: Dict[str, Any] = dict(
+173            status_codes=status_codes,
+174            exceptions=exceptions,
+175            max_attempts=request_max_attempts,
+176            condition=retry_condition,
+177            backoff_factor=request_backoff_factor,
+178            respect_retry_after_header=respect_retry_after_header,
+179            max_delay=request_max_retry_delay
+180        )
+181        self._session_attrs = session_attrs or {}
+182
+183        if TYPE_CHECKING:
+184            self.get = self.session.get
+185            self.post = self.session.post
+186            self.put = self.session.put
+187            self.patch = self.session.patch
+188            self.delete = self.session.delete
+189            self.head = self.session.head
+190            self.options = self.session.options
+191            self.request = self.session.request
+192
+193        self.get = lambda *a, **kw: self.session.get(*a, **kw)
+194        self.post = lambda *a, **kw: self.session.post(*a, **kw)
+195        self.put = lambda *a, **kw: self.session.put(*a, **kw)
+196        self.patch = lambda *a, **kw: self.session.patch(*a, **kw)
+197        self.delete = lambda *a, **kw: self.session.delete(*a, **kw)
+198        self.head = lambda *a, **kw: self.session.head(*a, **kw)
+199        self.options = lambda *a, **kw: self.session.options(*a, **kw)
+200        self.request = lambda *a, **kw: self.session.request(*a, **kw)
+201
+202        self._config_version: int = 0  # Incrementing marker to ensure per-thread sessions are recreated on config changes
+203
+204    def update_from_config(self, config: RunConfiguration) -> None:
+205        """Update session/retry settings from RunConfiguration"""
+206        self._session_kwargs['timeout'] = config.request_timeout
+207        self._retry_kwargs['backoff_factor'] = config.request_backoff_factor
+208        self._retry_kwargs['max_delay'] = config.request_max_retry_delay
+209        self._retry_kwargs['max_attempts'] = config.request_max_attempts
+210        self._config_version += 1
+211
+212    def _make_session(self) -> Session:
+213        session = Session(**self._session_kwargs)  # type: ignore[arg-type]
+214        for key, value in self._session_attrs.items():
+215            setattr(session, key, value)
+216        session.mount('http://', self._adapter)
+217        session.mount('https://', self._adapter)
+218        retry = _make_retry(**self._retry_kwargs)
+219        session.request = retry.wraps(session.request)  # type: ignore[method-assign]
+220        return session
+221
+222    @property
+223    def session(self) -> Session:
+224        session: Optional[Session] = getattr(self._local, 'session', None)
+225        version = self._config_version
+226        if session is not None:
+227            version = self._local.config_version
+228        if session is None or version != self._config_version:
+229            # Create a new session if config has changed
+230            session = self._local.session = self._make_session()
+231            self._local.config_version = self._config_version
+232        return session
+
+ + +

Wrapper for requests to create a Session with configurable retry functionality.

+ +

Summary

+ +

Create a requests.Session which automatically retries requests in case of error. +By default retries are triggered for 5xx and 429 status codes and when the server is unreachable or drops the connection.

+ +

Custom retry condition

+ +

You can provide one or more custom predicates for specific retry conditions. The predicate is called after every request with the resulting response and/or exception. +For example, this will trigger a retry when the response text is error:

+ +
+
>>> from typing import Optional
+>>> from requests import Response
+>>>
+>>> def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool:
+>>>     if response is None:
+>>>         return False
+>>>     return response.text == 'error'
+
+
+ +

The retry is triggered when any of the predicates or the default conditions based on status code/exception are True.

+ +

Args:

+ +
request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
+max_connections: Max connections per host in the HTTPAdapter pool
+raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
+session: Optional `requests.Session` instance to add the retry handler to. A new session is created by default.
+status_codes: Retry when response has any of these status codes. Default `429` and all `5xx` codes. Pass an empty list to disable retry based on status.
+exceptions: Retry on exception of given type(s). Default `(requests.ConnectionError, requests.Timeout, requests.ChunkedEncodingError)`. Pass an empty list to disable retry on exceptions.
+request_max_attempts: Max number of retry attempts before giving up
+retry_condition: A predicate or a list of predicates to decide whether to retry. If any predicate returns `True` the request is retried
+request_backoff_factor: Multiplier used for exponential delay between retries
+request_max_retry_delay: Maximum delay when using exponential backoff
+respect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay
+session_attrs: Extra attributes that will be set on the session instance, e.g. `{'headers': {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes)
+
+
+ + +
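Putting the arguments together, a sketch of a customized client; the predicate, the bearer token and the URL are illustrative placeholders, not part of the library:

```python
from typing import Optional

from requests import Response

from dlt.sources.helpers.requests import Client


def retry_on_empty_body(response: Optional[Response], exception: Optional[BaseException]) -> bool:
    # Hypothetical extra condition: retry as long as the body is empty
    return response is not None and not response.content


client = Client(
    request_timeout=30,
    request_max_attempts=3,
    retry_condition=retry_on_empty_body,
    session_attrs={"headers": {"Authorization": "Bearer <token>"}},
)
response = client.get("https://api.example.com/items")
```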
+ +
+
@with_config(spec=RunConfiguration)
+ + Client( request_timeout: Union[int, float, datetime.timedelta, Tuple[Union[int, float, datetime.timedelta], Union[int, float, datetime.timedelta]], NoneType] = 60, max_connections: int = 50, raise_for_status: bool = True, status_codes: Sequence[int] = (429, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599), exceptions: Sequence[Type[Exception]] = (<class 'requests.exceptions.ConnectionError'>, <class 'requests.exceptions.Timeout'>, <class 'requests.exceptions.ChunkedEncodingError'>), request_max_attempts: int = 5, retry_condition: Union[Callable[[Optional[requests.models.Response], Optional[BaseException]], bool], Sequence[Callable[[Optional[requests.models.Response], Optional[BaseException]], bool]], NoneType] = None, request_backoff_factor: float = 1, request_max_retry_delay: Union[int, float, datetime.timedelta] = 300, respect_retry_after_header: bool = True, session_attrs: Optional[Dict[str, Any]] = None) + + + +
+ +
154    @with_config(spec=RunConfiguration)
+155    def __init__(
+156        self,
+157        request_timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
+158        max_connections: int = 50,
+159        raise_for_status: bool = True,
+160        status_codes: Sequence[int] = DEFAULT_RETRY_STATUS,
+161        exceptions: Sequence[Type[Exception]] = DEFAULT_RETRY_EXCEPTIONS,
+162        request_max_attempts: int = RunConfiguration.request_max_attempts,
+163        retry_condition: Union[RetryPredicate, Sequence[RetryPredicate], None] = None,
+164        request_backoff_factor: float = RunConfiguration.request_backoff_factor,
+165        request_max_retry_delay: TimedeltaSeconds = RunConfiguration.request_max_retry_delay,
+166        respect_retry_after_header: bool = True,
+167        session_attrs: Optional[Dict[str, Any]] = None,
+168    ) -> None:
+169        self._adapter = HTTPAdapter(pool_maxsize=max_connections)
+170        self._local = local()
+171        self._session_kwargs = dict(timeout=request_timeout, raise_for_status=raise_for_status)
+172        self._retry_kwargs: Dict[str, Any] = dict(
+173            status_codes=status_codes,
+174            exceptions=exceptions,
+175            max_attempts=request_max_attempts,
+176            condition=retry_condition,
+177            backoff_factor=request_backoff_factor,
+178            respect_retry_after_header=respect_retry_after_header,
+179            max_delay=request_max_retry_delay
+180        )
+181        self._session_attrs = session_attrs or {}
+182
+183        if TYPE_CHECKING:
+184            self.get = self.session.get
+185            self.post = self.session.post
+186            self.put = self.session.put
+187            self.patch = self.session.patch
+188            self.delete = self.session.delete
+189            self.head = self.session.head
+190            self.options = self.session.options
+191            self.request = self.session.request
+192
+193        self.get = lambda *a, **kw: self.session.get(*a, **kw)
+194        self.post = lambda *a, **kw: self.session.post(*a, **kw)
+195        self.put = lambda *a, **kw: self.session.put(*a, **kw)
+196        self.patch = lambda *a, **kw: self.session.patch(*a, **kw)
+197        self.delete = lambda *a, **kw: self.session.delete(*a, **kw)
+198        self.head = lambda *a, **kw: self.session.head(*a, **kw)
+199        self.options = lambda *a, **kw: self.session.options(*a, **kw)
+200        self.request = lambda *a, **kw: self.session.request(*a, **kw)
+201
+202        self._config_version: int = 0  # Incrementing marker to ensure per-thread sessions are recreated on config changes
+
+ + + + +
+
+
+ get + + +
+ + + + +
+
+
+ post + + +
+ + + + +
+
+
+ put + + +
+ + + + +
+
+
+ patch + + +
+ + + + +
+
+
+ delete + + +
+ + + + +
+
+
+ head + + +
+ + + + +
+
+
+ options + + +
+ + + + +
+
+
+ request + + +
+ + + + +
+
+ +
+ + def + update_from_config( self, config: dlt.common.configuration.specs.run_configuration.RunConfiguration) -> None: + + + +
+ +
204    def update_from_config(self, config: RunConfiguration) -> None:
+205        """Update session/retry settings from RunConfiguration"""
+206        self._session_kwargs['timeout'] = config.request_timeout
+207        self._retry_kwargs['backoff_factor'] = config.request_backoff_factor
+208        self._retry_kwargs['max_delay'] = config.request_max_retry_delay
+209        self._retry_kwargs['max_attempts'] = config.request_max_attempts
+210        self._config_version += 1
+
+ + +

Update session/retry settings from RunConfiguration

+
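A sketch of reconfiguring the shared default client at runtime via `init()`, which forwards to `update_from_config`; it assumes `RunConfiguration` can be instantiated with defaults and mutated, and the values are illustrative:

```python
from dlt.common.configuration.specs import RunConfiguration
from dlt.sources.helpers import requests

config = RunConfiguration()
config.request_timeout = 120      # seconds
config.request_max_attempts = 10

# Bumps the internal config version; per-thread sessions are rebuilt
# lazily on next access, so the new settings apply everywhere
requests.init(config)
```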
+ + +
+
+ + + + + +
+
+
+ + \ No newline at end of file diff --git a/docs/website/docs/api_reference/dlt/sources/helpers/requests/session.html b/docs/website/docs/api_reference/dlt/sources/helpers/requests/session.html new file mode 100644 index 0000000000..d2c57dfc28 --- /dev/null +++ b/docs/website/docs/api_reference/dlt/sources/helpers/requests/session.html @@ -0,0 +1,534 @@ + + + + + + + dlt.sources.helpers.requests.session API documentation + + + + + + + + + +
+
+

+dlt.sources.helpers.requests.session

+ + + + + + +
 1from requests import Session as BaseSession
+ 2from tenacity import Retrying, retry_if_exception_type
+ 3from typing import Optional, TYPE_CHECKING, Sequence, Union, Tuple, Type, TypeVar
+ 4
+ 5from dlt.sources.helpers.requests.typing import TRequestTimeout
+ 6from dlt.common.typing import TimedeltaSeconds
+ 7from dlt.common.time import to_seconds
+ 8from dlt.version import __version__
+ 9
+10
+11TSession = TypeVar("TSession", bound=BaseSession)
+12
+13
+14DEFAULT_TIMEOUT = 60
+15
+16
+17def _timeout_to_seconds(timeout: TRequestTimeout) -> Optional[Union[Tuple[float, float], float]]:
+18    return (to_seconds(timeout[0]), to_seconds(timeout[1])) if isinstance(timeout, tuple) else to_seconds(timeout)
+19
+20
+21class Session(BaseSession):
+22    """Requests session which by default adds a timeout to all requests and calls `raise_for_status()` on response
+23
+24    ### Args
+25        timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
+26            May be a single value or a tuple for separate (connect, read) timeout.
+27        raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
+28    """
+29    def __init__(
+30        self,
+31        timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
+32        raise_for_status: bool = True,
+33    ) -> None:
+34        super().__init__()
+35        self.timeout = _timeout_to_seconds(timeout)
+36        self.raise_for_status = raise_for_status
+37        self.headers.update({
+38            "User-Agent": f"dlt/{__version__}",
+39        })
+40
+41    if TYPE_CHECKING:
+42        request = BaseSession.request
+43
+44    def request(self, *args, **kwargs):  # type: ignore[no-untyped-def,no-redef]
+45        kwargs.setdefault('timeout', self.timeout)
+46        resp = super().request(*args, **kwargs)
+47        if self.raise_for_status:
+48            resp.raise_for_status()
+49        return resp
+
+ + +
+
+
+ DEFAULT_TIMEOUT = +60 + + +
+ + + + +
+
+ +
+ + class + Session(requests.sessions.Session): + + + +
+ +
22class Session(BaseSession):
+23    """Requests session which by default adds a timeout to all requests and calls `raise_for_status()` on response
+24
+25    ### Args
+26        timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
+27            May be a single value or a tuple for separate (connect, read) timeout.
+28        raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
+29    """
+30    def __init__(
+31        self,
+32        timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
+33        raise_for_status: bool = True,
+34    ) -> None:
+35        super().__init__()
+36        self.timeout = _timeout_to_seconds(timeout)
+37        self.raise_for_status = raise_for_status
+38        self.headers.update({
+39            "User-Agent": f"dlt/{__version__}",
+40        })
+41
+42    if TYPE_CHECKING:
+43        request = BaseSession.request
+44
+45    def request(self, *args, **kwargs):  # type: ignore[no-untyped-def,no-redef]
+46        kwargs.setdefault('timeout', self.timeout)
+47        resp = super().request(*args, **kwargs)
+48        if self.raise_for_status:
+49            resp.raise_for_status()
+50        return resp
+
+ + +

Requests session which by default adds a timeout to all requests and calls raise_for_status() on the response

+ +

Args

+ +
timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
+    May be a single value or a tuple for separate (connect, read) timeout.
+raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
+
+
+ + +
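A short sketch of direct use; note the tuple form for separate connect/read timeouts and that `timedelta` values are accepted (the URL is a placeholder):

```python
from datetime import timedelta

from dlt.sources.helpers.requests import Session

# 5 s to connect, up to 2 min to read; status handling is left to the caller
session = Session(timeout=(5, timedelta(minutes=2)), raise_for_status=False)
response = session.get("https://api.example.com/health")
if not response.ok:
    print("server reported", response.status_code)
```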
+ +
+ + Session( timeout: Union[int, float, datetime.timedelta, Tuple[Union[int, float, datetime.timedelta], Union[int, float, datetime.timedelta]], NoneType] = 60, raise_for_status: bool = True) + + + +
+ +
30    def __init__(
+31        self,
+32        timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
+33        raise_for_status: bool = True,
+34    ) -> None:
+35        super().__init__()
+36        self.timeout = _timeout_to_seconds(timeout)
+37        self.raise_for_status = raise_for_status
+38        self.headers.update({
+39            "User-Agent": f"dlt/{__version__}",
+40        })
+
+ + + + +
+
+
+ timeout + + +
+ + + + +
+
+
+ raise_for_status + + +
+ + + + +
+
+ +
+ + def + request(self, *args, **kwargs): + + + +
+ +
45    def request(self, *args, **kwargs):  # type: ignore[no-untyped-def,no-redef]
+46        kwargs.setdefault('timeout', self.timeout)
+47        resp = super().request(*args, **kwargs)
+48        if self.raise_for_status:
+49            resp.raise_for_status()
+50        return resp
+
+ + +

Constructs a Request, prepares it and sends it. +Returns a Response object.

+ +
Parameters
+ +
    +
  • method: method for the new Request object.
  • +
  • url: URL for the new Request object.
  • +
  • params: (optional) Dictionary or bytes to be sent in the query +string for the Request.
  • +
  • data: (optional) Dictionary, list of tuples, bytes, or file-like +object to send in the body of the Request.
  • +
  • json: (optional) json to send in the body of the +Request.
  • +
  • headers: (optional) Dictionary of HTTP Headers to send with the +Request.
  • +
  • cookies: (optional) Dict or CookieJar object to send with the +Request.
  • +
  • files: (optional) Dictionary of 'filename': file-like-objects +for multipart encoding upload.
  • +
  • auth: (optional) Auth tuple or callable to enable +Basic/Digest/Custom HTTP Auth.
  • +
  • timeout: (optional) How long to wait for the server to send +data before giving up, as a float, or a (connect timeout, +read timeout) tuple.
  • +
  • allow_redirects: (optional) Set to True by default.
  • +
  • proxies: (optional) Dictionary mapping protocol or protocol and +hostname to the URL of the proxy.
  • +
  • stream: (optional) whether to immediately download the response +content. Defaults to False.
  • +
  • verify: (optional) Either a boolean, in which case it controls whether we verify +the server's TLS certificate, or a string, in which case it must be a path +to a CA bundle to use. Defaults to True. When set to +False, requests will accept any TLS certificate presented by +the server, and will ignore hostname mismatches and/or expired +certificates, which will make your application vulnerable to +man-in-the-middle (MitM) attacks. Setting verify to False +may be useful during local development or testing.
  • +
  • cert: (optional) if String, path to ssl client cert file (.pem). +If Tuple, ('cert', 'key') pair.
  • +
+
+ + +
+
+
Inherited Members
+
+
requests.sessions.Session
+
headers
+
auth
+
proxies
+
hooks
+
params
+
stream
+
verify
+
cert
+
max_redirects
+
trust_env
+
cookies
+
adapters
+
prepare_request
+
get
+
options
+
head
+
post
+
put
+
patch
+
delete
+
send
+
merge_environment_settings
+
get_adapter
+
close
+
mount
+ +
+
requests.sessions.SessionRedirectMixin
+
get_redirect_target
+
should_strip_auth
+
resolve_redirects
+
rebuild_auth
+
rebuild_proxies
+
rebuild_method
+ +
+
+
+
+
+ + \ No newline at end of file diff --git a/docs/website/docs/api_reference/dlt/sources/helpers/requests/typing.html b/docs/website/docs/api_reference/dlt/sources/helpers/requests/typing.html new file mode 100644 index 0000000000..a7add0503e --- /dev/null +++ b/docs/website/docs/api_reference/dlt/sources/helpers/requests/typing.html @@ -0,0 +1,263 @@ + + + + + + + dlt.sources.helpers.requests.typing API documentation + + + + + + + + + +
+
+

+dlt.sources.helpers.requests.typing

+ + + + + + +
1from typing import Tuple, Union, Optional
+2
+3from dlt.common.typing import TimedeltaSeconds
+4
+5# Either a single timeout or tuple (connect,read) timeout
+6TRequestTimeout = Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]
+
+ + +
+
+
+ TRequestTimeout = + + typing.Union[int, float, datetime.timedelta, typing.Tuple[typing.Union[int, float, datetime.timedelta], typing.Union[int, float, datetime.timedelta]]] + + +
+ + + + +
+
+ + \ No newline at end of file diff --git a/docs/website/docs/api_reference/dlt/sources/helpers/transform.html b/docs/website/docs/api_reference/dlt/sources/helpers/transform.html new file mode 100644 index 0000000000..67e381cefd --- /dev/null +++ b/docs/website/docs/api_reference/dlt/sources/helpers/transform.html @@ -0,0 +1,323 @@ + + + + + + + dlt.sources.helpers.transform API documentation + + + + + + + + + +
+
+

+dlt.sources.helpers.transform

+ + + + + + +
 1from dlt.common.typing import TDataItem
+ 2from dlt.extract.typing import ItemTransformFunctionNoMeta
+ 3
+ 4
+ 5def take_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]:
+ 6    """A filter that takes only first `max_items` from a resource"""
+ 7    count: int = 0
+ 8    def _filter(_: TDataItem) -> bool:
+ 9        nonlocal count
+10        count += 1
+11        return count <= max_items
+12    return _filter
+13
+14
+15def skip_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]:
+16    """A filter that skips first `max_items` from a resource"""
+17    count: int = 0
+18    def _filter(_: TDataItem) -> bool:
+19        nonlocal count
+20        count += 1
+21        return count > max_items
+22    return _filter
+
+ + +
+
+ +
+ + def + take_first(max_items: int) -> Callable[[Any], bool]: + + + +
+ +
 6def take_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]:
+ 7    """A filter that takes only first `max_items` from a resource"""
+ 8    count: int = 0
+ 9    def _filter(_: TDataItem) -> bool:
+10        nonlocal count
+11        count += 1
+12        return count <= max_items
+13    return _filter
+
+ + +

A filter that takes only the first max_items from a resource

+
+ + +
+
+ +
+ + def + skip_first(max_items: int) -> Callable[[Any], bool]: + + + +
+ +
16def skip_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]:
+17    """A filter that skips first `max_items` from a resource"""
+18    count: int = 0
+19    def _filter(_: TDataItem) -> bool:
+20        nonlocal count
+21        count += 1
+22        return count > max_items
+23    return _filter
+
+ + +

A filter that skips the first max_items from a resource

+
+ + +
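Both helpers return stateful predicates meant to be plugged into a resource with `add_filter`; a minimal sketch for `take_first` (`skip_first` is the mirror image):

```python
import dlt
from dlt.sources.helpers.transform import take_first


@dlt.resource
def numbers():
    yield from range(1_000)


# The predicate lets the first 10 items through and filters out the rest
first_ten = numbers().add_filter(take_first(10))
print(list(first_ten))  # [0, 1, ..., 9]
```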
+
+ + \ No newline at end of file diff --git a/docs/website/docs/api_reference/index.html b/docs/website/docs/api_reference/index.html new file mode 100644 index 0000000000..a7829f45b5 --- /dev/null +++ b/docs/website/docs/api_reference/index.html @@ -0,0 +1,7 @@ + + + + + + + diff --git a/docs/website/docs/api_reference/search.js b/docs/website/docs/api_reference/search.js new file mode 100644 index 0000000000..a8a8b9f84d --- /dev/null +++ b/docs/website/docs/api_reference/search.js @@ -0,0 +1,46 @@ +window.pdocSearch = (function(){ +/** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();o

\n"}, {"fullname": "dlt.sources", "modulename": "dlt.sources", "kind": "module", "doc": "

Module with built in sources and source building blocks

\n"}, {"fullname": "dlt.sources.helpers", "modulename": "dlt.sources.helpers", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests", "modulename": "dlt.sources.helpers.requests", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.client", "modulename": "dlt.sources.helpers.requests", "qualname": "client", "kind": "variable", "doc": "

\n", "default_value": "<dlt.sources.helpers.requests.retry.Client object>"}, {"fullname": "dlt.sources.helpers.requests.init", "modulename": "dlt.sources.helpers.requests", "qualname": "init", "kind": "function", "doc": "

Initialize the default requests client from config

\n", "signature": "(\tconfig: dlt.common.configuration.specs.run_configuration.RunConfiguration) -> None:", "funcdef": "def"}, {"fullname": "dlt.sources.helpers.requests.retry", "modulename": "dlt.sources.helpers.requests.retry", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.DEFAULT_RETRY_STATUS", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "DEFAULT_RETRY_STATUS", "kind": "variable", "doc": "

\n", "default_value": "(429, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599)"}, {"fullname": "dlt.sources.helpers.requests.retry.DEFAULT_RETRY_EXCEPTIONS", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "DEFAULT_RETRY_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'requests.exceptions.ConnectionError'>, <class 'requests.exceptions.Timeout'>, <class 'requests.exceptions.ChunkedEncodingError'>)"}, {"fullname": "dlt.sources.helpers.requests.retry.RetryPredicate", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "RetryPredicate", "kind": "variable", "doc": "

\n", "default_value": "typing.Callable[[typing.Optional[requests.models.Response], typing.Optional[BaseException]], bool]"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_status", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_status", "kind": "class", "doc": "

Retry for given response status codes

\n", "bases": "tenacity.retry.retry_base"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_status.__init__", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_status.__init__", "kind": "function", "doc": "

\n", "signature": "(status_codes: Sequence[int])"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_status.status_codes", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_status.status_codes", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_predicate", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_predicate", "kind": "class", "doc": "

Retry when a custom predicate, called with the response and/or exception, returns True.

\n", "bases": "tenacity.retry.retry_base"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_predicate.__init__", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_predicate.__init__", "kind": "function", "doc": "

\n", "signature": "(\tpredicate: Callable[[Optional[requests.models.Response], Optional[BaseException]], bool])"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_predicate.predicate", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_predicate.predicate", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.wait_exponential_retry_after", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "wait_exponential_retry_after", "kind": "class", "doc": "

Wait strategy that applies exponential backoff.

\n\n

It allows for a customized multiplier and an ability to restrict the\nupper and lower limits to some maximum and minimum value.

\n\n

The intervals are fixed (i.e. there is no jitter), so this strategy is\nsuitable for balancing retries against latency when a required resource is\nunavailable for an unknown duration, but not suitable for resolving\ncontention between multiple processes for a shared resource. Use\nwait_random_exponential for the latter case.

\n", "bases": "tenacity.wait.wait_exponential"}, {"fullname": "dlt.sources.helpers.requests.retry.Client", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client", "kind": "class", "doc": "

Wrapper for requests to create a Session with configurable retry functionality.

\n\n

Summary

\n\n

Create a requests.Session which automatically retries requests in case of error.\nBy default retries are triggered for 5xx and 429 status codes and when the server is unreachable or drops the connection.

\n\n

Custom retry condition

\n\n

You can provide one or more custom predicates for specific retry conditions. The predicate is called after every request with the resulting response and/or exception.\nFor example, this will trigger a retry when the response text is error:

\n\n
\n
>>> from typing import Optional\n>>> from requests import Response\n>>>\n>>> def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool:\n>>>     if response is None:\n>>>         return False\n>>>     return response.text == 'error'\n
\n
\n\n

The retry is triggered when either any of the predicates or the default conditions based on status code/exception are True.

\n\n

Args:

\n\n
request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.\nmax_connections: Max connections per host in the HTTPAdapter pool\nraise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)\nsession: Optional `requests.Session` instance to add the retry handler to. A new session is created by default.\nstatus_codes: Retry when response has any of these status codes. Default `429` and all `5xx` codes. Pass an empty list to disable retry based on status.\nexceptions: Retry on exception of given type(s). Default `(requests.Timeout, requests.ConnectionError)`. Pass an empty list to disable retry on exceptions.\nrequest_max_attempts: Max number of retry attempts before giving up\nretry_condition: A predicate or a list of predicates to decide whether to retry. If any predicate returns `True` the request is retried\nrequest_backoff_factor: Multiplier used for exponential delay between retries\nrequest_max_retry_delay: Maximum delay when using exponential backoff\nrespect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay\nsession_attrs: Extra attributes that will be set on the session instance, e.g. `{headers: {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes)\n
\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.__init__", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.__init__", "kind": "function", "doc": "

\n", "signature": "(\trequest_timeout: Union[int, float, datetime.timedelta, Tuple[Union[int, float, datetime.timedelta], Union[int, float, datetime.timedelta]], NoneType] = 60,\tmax_connections: int = 50,\traise_for_status: bool = True,\tstatus_codes: Sequence[int] = (429, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599),\texceptions: Sequence[Type[Exception]] = (<class 'requests.exceptions.ConnectionError'>, <class 'requests.exceptions.Timeout'>, <class 'requests.exceptions.ChunkedEncodingError'>),\trequest_max_attempts: int = 5,\tretry_condition: Union[Callable[[Optional[requests.models.Response], Optional[BaseException]], bool], Sequence[Callable[[Optional[requests.models.Response], Optional[BaseException]], bool]], NoneType] = None,\trequest_backoff_factor: float = 1,\trequest_max_retry_delay: Union[int, float, datetime.timedelta] = 300,\trespect_retry_after_header: bool = True,\tsession_attrs: Optional[Dict[str, Any]] = None)"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.get", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.get", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.post", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.post", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.put", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.put", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.patch", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.patch", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.delete", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.delete", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.head", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.head", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.options", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.options", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.request", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.request", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.update_from_config", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.update_from_config", "kind": "function", "doc": "

Update session/retry settings from RunConfiguration

\n", "signature": "(\tself,\tconfig: dlt.common.configuration.specs.run_configuration.RunConfiguration) -> None:", "funcdef": "def"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.session", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.session", "kind": "variable", "doc": "

\n", "annotation": ": dlt.sources.helpers.requests.session.Session"}, {"fullname": "dlt.sources.helpers.requests.session", "modulename": "dlt.sources.helpers.requests.session", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.session.DEFAULT_TIMEOUT", "modulename": "dlt.sources.helpers.requests.session", "qualname": "DEFAULT_TIMEOUT", "kind": "variable", "doc": "

\n", "default_value": "60"}, {"fullname": "dlt.sources.helpers.requests.session.Session", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session", "kind": "class", "doc": "

Requests session which by default adds a timeout to all requests and calls raise_for_status() on the response

\n\n

Args

\n\n
timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.\n    May be a single value or a tuple for separate (connect, read) timeout.\nraise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)\n
\n", "bases": "requests.sessions.Session"}, {"fullname": "dlt.sources.helpers.requests.session.Session.__init__", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session.__init__", "kind": "function", "doc": "

\n", "signature": "(\ttimeout: Union[int, float, datetime.timedelta, Tuple[Union[int, float, datetime.timedelta], Union[int, float, datetime.timedelta]], NoneType] = 60,\traise_for_status: bool = True)"}, {"fullname": "dlt.sources.helpers.requests.session.Session.timeout", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session.timeout", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.session.Session.raise_for_status", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session.raise_for_status", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.session.Session.request", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session.request", "kind": "function", "doc": "

Constructs a Request, prepares it and sends it.\nReturns a Response object.

\n\n
Parameters
\n\n
    \n
  • method: method for the new Request object.
  • \n
  • url: URL for the new Request object.
  • \n
  • params: (optional) Dictionary or bytes to be sent in the query\nstring for the Request.
  • \n
  • data: (optional) Dictionary, list of tuples, bytes, or file-like\nobject to send in the body of the Request.
  • \n
  • json: (optional) json to send in the body of the\nRequest.
  • \n
  • headers: (optional) Dictionary of HTTP Headers to send with the\nRequest.
  • \n
  • cookies: (optional) Dict or CookieJar object to send with the\nRequest.
  • \n
  • files: (optional) Dictionary of 'filename': file-like-objects\nfor multipart encoding upload.
  • \n
  • auth: (optional) Auth tuple or callable to enable\nBasic/Digest/Custom HTTP Auth.
  • \n
  • timeout: (optional) How long to wait for the server to send\ndata before giving up, as a float, or a (connect timeout,\nread timeout) tuple.
  • \n
  • allow_redirects: (optional) Set to True by default.
  • \n
  • proxies: (optional) Dictionary mapping protocol or protocol and\nhostname to the URL of the proxy.
  • \n
  • stream: (optional) whether to immediately download the response\ncontent. Defaults to False.
  • \n
  • verify: (optional) Either a boolean, in which case it controls whether we verify\nthe server's TLS certificate, or a string, in which case it must be a path\nto a CA bundle to use. Defaults to True. When set to\nFalse, requests will accept any TLS certificate presented by\nthe server, and will ignore hostname mismatches and/or expired\ncertificates, which will make your application vulnerable to\nman-in-the-middle (MitM) attacks. Setting verify to False\nmay be useful during local development or testing.
  • \n
  • cert: (optional) if String, path to ssl client cert file (.pem).\nIf Tuple, ('cert', 'key') pair.
  • \n
\n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "dlt.sources.helpers.requests.typing", "modulename": "dlt.sources.helpers.requests.typing", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.typing.TRequestTimeout", "modulename": "dlt.sources.helpers.requests.typing", "qualname": "TRequestTimeout", "kind": "variable", "doc": "

\n", "default_value": "typing.Union[int, float, datetime.timedelta, typing.Tuple[typing.Union[int, float, datetime.timedelta], typing.Union[int, float, datetime.timedelta]]]"}, {"fullname": "dlt.sources.helpers.transform", "modulename": "dlt.sources.helpers.transform", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.transform.take_first", "modulename": "dlt.sources.helpers.transform", "qualname": "take_first", "kind": "function", "doc": "

A filter that takes only the first max_items from a resource

\n", "signature": "(max_items: int) -> Callable[[Any], bool]:", "funcdef": "def"}, {"fullname": "dlt.sources.helpers.transform.skip_first", "modulename": "dlt.sources.helpers.transform", "qualname": "skip_first", "kind": "function", "doc": "

A filter that skips the first max_items from a resource

\n", "signature": "(max_items: int) -> Callable[[Any], bool]:", "funcdef": "def"}]; + + // mirrored in build-search-index.js (part 1) + // Also split on html tags. this is a cheap heuristic, but good enough. + elasticlunr.tokenizer.setSeperator(/[\s\-.;&_'"=,()]+|<[^>]*>/); + + let searchIndex; + if (docs._isPrebuiltIndex) { + console.info("using precompiled search index"); + searchIndex = elasticlunr.Index.load(docs); + } else { + console.time("building search index"); + // mirrored in build-search-index.js (part 2) + searchIndex = elasticlunr(function () { + this.pipeline.remove(elasticlunr.stemmer); + this.pipeline.remove(elasticlunr.stopWordFilter); + this.addField("qualname"); + this.addField("fullname"); + this.addField("annotation"); + this.addField("default_value"); + this.addField("signature"); + this.addField("bases"); + this.addField("doc"); + this.setRef("fullname"); + }); + for (let doc of docs) { + searchIndex.addDoc(doc); + } + console.timeEnd("building search index"); + } + + return (term) => searchIndex.search(term, { + fields: { + qualname: {boost: 4}, + fullname: {boost: 2}, + annotation: {boost: 2}, + default_value: {boost: 2}, + signature: {boost: 2}, + bases: {boost: 2}, + doc: {boost: 1}, + }, + expand: true + }); +})(); \ No newline at end of file diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 9d17d6646b..fa18418039 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -263,6 +263,13 @@ const sidebars = { 'reference/explainers/schema-evolution', 'reference/telemetry', 'reference/airflow-gcp-cloud-composer', + { + type: 'category', + label: 'API Reference', + items: [ + 'api_reference/index.html', + ] + }, ], }, ] From 0099c9b362d2ecdd64b01166e7d605468e255585 Mon Sep 17 00:00:00 2001 From: AstrakhantsevaAA Date: Mon, 11 Sep 2023 18:39:09 +0200 Subject: [PATCH 02/10] [docs] move api ref to static --- docs/website/docs/api-reference.md | 5 + docs/website/sidebars.js | 2 +- .../api_reference/dlt/sources.html | 0 .../dlt/sources/credentials.html | 4 +- .../api_reference/dlt/sources/helpers.html | 10 +- .../dlt/sources/helpers/requests.html | 12 +- .../dlt/sources/helpers/requests/retry.html | 108 +++++++++--------- .../dlt/sources/helpers/requests/session.html | 30 ++--- .../dlt/sources/helpers/requests/typing.html | 10 +- .../dlt/sources/helpers/transform.html | 8 +- .../{docs => static}/api_reference/index.html | 0 .../{docs => static}/api_reference/search.js | 0 12 files changed, 97 insertions(+), 92 deletions(-) create mode 100644 docs/website/docs/api-reference.md rename docs/website/{docs => static}/api_reference/dlt/sources.html (100%) rename docs/website/{docs => static}/api_reference/dlt/sources/credentials.html (99%) rename docs/website/{docs => static}/api_reference/dlt/sources/helpers.html (99%) rename docs/website/{docs => static}/api_reference/dlt/sources/helpers/requests.html (99%) rename docs/website/{docs => static}/api_reference/dlt/sources/helpers/requests/retry.html (99%) rename docs/website/{docs => static}/api_reference/dlt/sources/helpers/requests/session.html (99%) rename docs/website/{docs => static}/api_reference/dlt/sources/helpers/requests/typing.html (99%) rename docs/website/{docs => static}/api_reference/dlt/sources/helpers/transform.html (99%) rename docs/website/{docs => static}/api_reference/index.html (100%) rename docs/website/{docs => static}/api_reference/search.js (100%) diff --git a/docs/website/docs/api-reference.md b/docs/website/docs/api-reference.md new file mode 100644 index 
0000000000..4dc44dcc9e --- /dev/null +++ b/docs/website/docs/api-reference.md @@ -0,0 +1,5 @@ +--- +id: api_reference +title: API Reference +--- +[Reference](../static/api_reference/index.html) \ No newline at end of file diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index fa18418039..1295686a2a 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -267,7 +267,7 @@ const sidebars = { type: 'category', label: 'API Reference', items: [ - 'api_reference/index.html', + 'api-reference', ] }, ], diff --git a/docs/website/docs/api_reference/dlt/sources.html b/docs/website/static/api_reference/dlt/sources.html similarity index 100% rename from docs/website/docs/api_reference/dlt/sources.html rename to docs/website/static/api_reference/dlt/sources.html diff --git a/docs/website/docs/api_reference/dlt/sources/credentials.html b/docs/website/static/api_reference/dlt/sources/credentials.html similarity index 99% rename from docs/website/docs/api_reference/dlt/sources/credentials.html rename to docs/website/static/api_reference/dlt/sources/credentials.html index 67f2d9f0b5..634676c8d2 100644 --- a/docs/website/docs/api_reference/dlt/sources/credentials.html +++ b/docs/website/static/api_reference/dlt/sources/credentials.html @@ -40,9 +40,9 @@

-dlt.sources.credentials

+dlt.sources.credentials + - diff --git a/docs/website/docs/api_reference/dlt/sources/helpers.html b/docs/website/static/api_reference/dlt/sources/helpers.html similarity index 99% rename from docs/website/docs/api_reference/dlt/sources/helpers.html rename to docs/website/static/api_reference/dlt/sources/helpers.html index 2e51f30d90..0517a33bbb 100644 --- a/docs/website/docs/api_reference/dlt/sources/helpers.html +++ b/docs/website/static/api_reference/dlt/sources/helpers.html @@ -45,12 +45,12 @@

Submodules

-dlt.sources.helpers

+dlt.sources.helpers + + + + - - - -
- \ No newline at end of file diff --git a/docs/website/static/api_reference/dlt/sources/credentials.html b/docs/website/static/api_reference/dlt/sources/credentials.html deleted file mode 100644 index 634676c8d2..0000000000 --- a/docs/website/static/api_reference/dlt/sources/credentials.html +++ /dev/null @@ -1,241 +0,0 @@ - - - - - - - dlt.sources.credentials API documentation - - - - - - - - - -
-
-

-dlt.sources.credentials

- - - - - - -
1from dlt.common.configuration.specs import GcpServiceAccountCredentials, GcpOAuthCredentials, GcpCredentials
-2from dlt.common.configuration.specs import ConnectionStringCredentials
-3from dlt.common.configuration.specs import OAuth2Credentials
-4from dlt.common.configuration.specs import CredentialsConfiguration, configspec
-
- - -
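The module above simply re-exports dlt's credential specs. A minimal sketch of wiring one up by hand, assuming the spec parses a native connection string via parse_native_representation as other dlt credential specs do (the DSN is hypothetical):

from dlt.sources.credentials import ConnectionStringCredentials

creds = ConnectionStringCredentials()
# parse a DSN into drivername/username/password/host/database fields (hypothetical DSN)
creds.parse_native_representation("postgresql://loader:secret@localhost:5432/dlt_data")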
-
- - \ No newline at end of file diff --git a/docs/website/static/api_reference/dlt/sources/helpers.html b/docs/website/static/api_reference/dlt/sources/helpers.html deleted file mode 100644 index 0517a33bbb..0000000000 --- a/docs/website/static/api_reference/dlt/sources/helpers.html +++ /dev/null @@ -1,238 +0,0 @@ - - - - - - - dlt.sources.helpers API documentation - - - - - - - - - -
-
-

-dlt.sources.helpers

- - - - - -
-
- - \ No newline at end of file diff --git a/docs/website/static/api_reference/dlt/sources/helpers/requests.html b/docs/website/static/api_reference/dlt/sources/helpers/requests.html deleted file mode 100644 index 71d35a232a..0000000000 --- a/docs/website/static/api_reference/dlt/sources/helpers/requests.html +++ /dev/null @@ -1,315 +0,0 @@ - - - - - - - dlt.sources.helpers.requests API documentation - - - - - - - - - -
-
-

-dlt.sources.helpers.requests

- - - - - - -
 1from tenacity import RetryError
- 2from requests import (
- 3    Request, Response,
- 4    ConnectionError,
- 5    ConnectTimeout,
- 6    FileModeWarning,
- 7    HTTPError,
- 8    ReadTimeout,
- 9    RequestException,
-10    Timeout,
-11    TooManyRedirects,
-12    URLRequired,
-13)
-14from requests.exceptions import ChunkedEncodingError
-15from dlt.sources.helpers.requests.retry import Client
-16from dlt.sources.helpers.requests.session import Session
-17from dlt.common.configuration.specs import RunConfiguration
-18
-19client = Client()
-20
-21get, post, put, patch, delete, options, head, request = (
-22    client.get, client.post, client.put, client.patch, client.delete, client.options, client.head, client.request
-23)
-24
-25
-26def init(config: RunConfiguration) -> None:
-27    """Initialize the default requests client from config"""
-28    client.update_from_config(config)
-
- - -
-
-
- client = -<dlt.sources.helpers.requests.retry.Client object> - - -
- - - - -
-
- -
- - def - init( config: dlt.common.configuration.specs.run_configuration.RunConfiguration) -> None: - - - -
- -
27def init(config: RunConfiguration) -> None:
-28    """Initialize the default requests client from config"""
-29    client.update_from_config(config)
-
- - -

Initialize the default requests client from config

-
- - -
-
- - \ No newline at end of file diff --git a/docs/website/static/api_reference/dlt/sources/helpers/requests/retry.html b/docs/website/static/api_reference/dlt/sources/helpers/requests/retry.html deleted file mode 100644 index 1ce08e230f..0000000000 --- a/docs/website/static/api_reference/dlt/sources/helpers/requests/retry.html +++ /dev/null @@ -1,1139 +0,0 @@ - - - - - - - dlt.sources.helpers.requests.retry API documentation - - - - - - - - - -
-
-

-dlt.sources.helpers.requests.retry

- - - - - - -
  1from email.utils import parsedate_tz, mktime_tz
-  2import re
-  3import time
-  4from typing import Optional, cast, Callable, Type, Union, Sequence, Tuple, List, TYPE_CHECKING, Any, Dict
-  5from threading import local
-  6
-  7from requests import Response, HTTPError, Session as BaseSession
-  8from requests.exceptions import ConnectionError, Timeout, ChunkedEncodingError
-  9from requests.adapters import HTTPAdapter
- 10from tenacity import Retrying, retry_if_exception_type, stop_after_attempt, RetryCallState, retry_any, wait_exponential
- 11from tenacity.retry import retry_base
- 12
- 13from dlt.sources.helpers.requests.session import Session, DEFAULT_TIMEOUT
- 14from dlt.sources.helpers.requests.typing import TRequestTimeout
- 15from dlt.common.typing import TimedeltaSeconds
- 16from dlt.common.configuration.specs import RunConfiguration
- 17from dlt.common.configuration import with_config
- 18
- 19
- 20DEFAULT_RETRY_STATUS = (429, *range(500, 600))
- 21DEFAULT_RETRY_EXCEPTIONS = (ConnectionError, Timeout, ChunkedEncodingError)
- 22
- 23RetryPredicate = Callable[[Optional[Response], Optional[BaseException]], bool]
- 24
- 25
- 26def _get_retry_response(retry_state: RetryCallState) -> Optional[Response]:
- 27    ex = retry_state.outcome.exception()
- 28    if ex:
- 29        if isinstance(ex, HTTPError):
- 30            return cast(Response, ex.response)
- 31        return None
- 32    result = retry_state.outcome.result()
- 33    return result if isinstance(result, Response) else None
- 34
- 35
- 36class retry_if_status(retry_base):
- 37    """Retry for given response status codes"""
- 38
- 39    def __init__(self, status_codes: Sequence[int]) -> None:
- 40        self.status_codes = set(status_codes)
- 41
- 42    def __call__(self, retry_state: RetryCallState) -> bool:
- 43        response = _get_retry_response(retry_state)
- 44        if response is None:
- 45            return False
- 46        result = response.status_code in self.status_codes
- 47        return result
- 48
- 49
- 50class retry_if_predicate(retry_base):
- 51    def __init__(self, predicate: RetryPredicate) -> None:
- 52        self.predicate = predicate
- 53
- 54    def __call__(self, retry_state: RetryCallState) -> bool:
- 55        response = _get_retry_response(retry_state)
- 56        exception = retry_state.outcome.exception()
- 57        return self.predicate(response, exception)
- 58
- 59
- 60class wait_exponential_retry_after(wait_exponential):
- 61    def _parse_retry_after(self, retry_after: str) -> Optional[float]:
- 62        # Borrowed from urllib3
- 63        seconds: float
- 64        # Whitespace: https://tools.ietf.org/html/rfc7230#section-3.2.4
- 65        if re.match(r"^\s*[0-9]+\s*$", retry_after):
- 66            seconds = int(retry_after)
- 67        else:
- 68            retry_date_tuple = parsedate_tz(retry_after)
- 69            if retry_date_tuple is None:
- 70                return None
- 71            retry_date = mktime_tz(retry_date_tuple)
- 72            seconds = retry_date - time.time()
- 73        return max(self.min, min(self.max, seconds))
- 74
- 75    def _get_retry_after(self, retry_state: RetryCallState) -> Optional[float]:
- 76        response = _get_retry_response(retry_state)
- 77        if response is None:
- 78            return None
- 79        header = response.headers.get("Retry-After")
- 80        if not header:
- 81            return None
- 82        return self._parse_retry_after(header)
- 83
- 84    def __call__(self, retry_state: RetryCallState) -> float:
- 85        retry_after = self._get_retry_after(retry_state)
- 86        if retry_after is not None:
- 87            return retry_after
- 88        return super().__call__(retry_state)
- 89
- 90
- 91def _make_retry(
- 92    status_codes: Sequence[int],
- 93    exceptions: Sequence[Type[Exception]],
- 94    max_attempts: int,
- 95    condition: Union[RetryPredicate, Sequence[RetryPredicate], None],
- 96    backoff_factor: float,
- 97    respect_retry_after_header: bool,
- 98    max_delay: TimedeltaSeconds,
- 99)-> Retrying:
-100    retry_conds = [retry_if_status(status_codes), retry_if_exception_type(tuple(exceptions))]
-101    if condition is not None:
-102        if callable(condition):
-103            retry_condition = [condition]
-104        retry_conds.extend([retry_if_predicate(c) for c in retry_condition])
-105
-106    wait_cls = wait_exponential_retry_after if respect_retry_after_header else wait_exponential
-107    return Retrying(
-108        wait=wait_cls(multiplier=backoff_factor, max=max_delay),
-109        retry=(retry_any(*retry_conds)),
-110        stop=stop_after_attempt(max_attempts),
-111        reraise=True,
-112        retry_error_callback=lambda state: state.outcome.result(),
-113    )
-114
-115
-116class Client:
-117    """Wrapper for `requests` to create a `Session` with configurable retry functionality.
-118
-119    ### Summary
-120    Create a  `requests.Session` which automatically retries requests in case of error.
-121    By default retries are triggered for `5xx` and `429` status codes and when the server is unreachable or drops the connection.
-122
-123    ### Custom retry condition
-124    You can provide one or more custom predicates for specific retry conditions. The predicate is called after every request with the resulting response and/or exception.
-125    For example, this will trigger a retry when the response text is `error`:
-126
-127    >>> from typing import Optional
-128    >>> from requests import Response
-129    >>>
-130    >>> def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool:
-131    >>>     if response is None:
-132    >>>         return False
-133    >>>     return response.text == 'error'
-134
-135    The retry is triggered when either any of the predicates or the default conditions based on status code/exception are `True`.
-136
-137    ### Args:
-138        request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
-139        max_connections: Max connections per host in the HTTPAdapter pool
-140        raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
-141        session: Optional `requests.Session` instance to add the retry handler to. A new session is created by default.
-142        status_codes: Retry when response has any of these status codes. Default `429` and all `5xx` codes. Pass an empty list to disable retry based on status.
-143        exceptions: Retry on exception of given type(s). Default `(requests.Timeout, requests.ConnectionError)`. Pass an empty list to disable retry on exceptions.
-144        request_max_attempts: Max number of retry attempts before giving up
-145        retry_condition: A predicate or a list of predicates to decide whether to retry. If any predicate returns `True` the request is retried
-146        request_backoff_factor: Multiplier used for exponential delay between retries
-147        request_max_retry_delay: Maximum delay when using exponential backoff
-148        respect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay
-149        session_attrs: Extra attributes that will be set on the session instance, e.g. `{headers: {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes)
-150    """
-151    _session_attrs: Dict[str, Any]
-152
-153    @with_config(spec=RunConfiguration)
-154    def __init__(
-155        self,
-156        request_timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
-157        max_connections: int = 50,
-158        raise_for_status: bool = True,
-159        status_codes: Sequence[int] = DEFAULT_RETRY_STATUS,
-160        exceptions: Sequence[Type[Exception]] = DEFAULT_RETRY_EXCEPTIONS,
-161        request_max_attempts: int = RunConfiguration.request_max_attempts,
-162        retry_condition: Union[RetryPredicate, Sequence[RetryPredicate], None] = None,
-163        request_backoff_factor: float = RunConfiguration.request_backoff_factor,
-164        request_max_retry_delay: TimedeltaSeconds = RunConfiguration.request_max_retry_delay,
-165        respect_retry_after_header: bool = True,
-166        session_attrs: Optional[Dict[str, Any]] = None,
-167    ) -> None:
-168        self._adapter = HTTPAdapter(pool_maxsize=max_connections)
-169        self._local = local()
-170        self._session_kwargs = dict(timeout=request_timeout, raise_for_status=raise_for_status)
-171        self._retry_kwargs: Dict[str, Any] = dict(
-172            status_codes=status_codes,
-173            exceptions=exceptions,
-174            max_attempts=request_max_attempts,
-175            condition=retry_condition,
-176            backoff_factor=request_backoff_factor,
-177            respect_retry_after_header=respect_retry_after_header,
-178            max_delay=request_max_retry_delay
-179        )
-180        self._session_attrs = session_attrs or {}
-181
-182        if TYPE_CHECKING:
-183            self.get = self.session.get
-184            self.post = self.session.post
-185            self.put = self.session.put
-186            self.patch = self.session.patch
-187            self.delete = self.session.delete
-188            self.head = self.session.head
-189            self.options = self.session.options
-190            self.request = self.session.request
-191
-192        self.get = lambda *a, **kw: self.session.get(*a, **kw)
-193        self.post = lambda *a, **kw: self.session.post(*a, **kw)
-194        self.put = lambda *a, **kw: self.session.put(*a, **kw)
-195        self.patch = lambda *a, **kw: self.session.patch(*a, **kw)
-196        self.delete = lambda *a, **kw: self.session.delete(*a, **kw)
-197        self.head = lambda *a, **kw: self.session.head(*a, **kw)
-198        self.options = lambda *a, **kw: self.session.options(*a, **kw)
-199        self.request = lambda *a, **kw: self.session.request(*a, **kw)
-200
-201        self._config_version: int = 0  # Incrementing marker to ensure per-thread sessions are recreated on config changes
-202
-203    def update_from_config(self, config: RunConfiguration) -> None:
-204        """Update session/retry settings from RunConfiguration"""
-205        self._session_kwargs['timeout'] = config.request_timeout
-206        self._retry_kwargs['backoff_factor'] = config.request_backoff_factor
-207        self._retry_kwargs['max_delay'] = config.request_max_retry_delay
-208        self._retry_kwargs['max_attempts'] = config.request_max_attempts
-209        self._config_version += 1
-210
-211    def _make_session(self) -> Session:
-212        session = Session(**self._session_kwargs)  # type: ignore[arg-type]
-213        for key, value in self._session_attrs.items():
-214            setattr(session, key, value)
-215        session.mount('http://', self._adapter)
-216        session.mount('https://', self._adapter)
-217        retry = _make_retry(**self._retry_kwargs)
-218        session.request = retry.wraps(session.request)  # type: ignore[method-assign]
-219        return session
-220
-221    @property
-222    def session(self) -> Session:
-223        session: Optional[Session] = getattr(self._local, 'session', None)
-224        version = self._config_version
-225        if session is not None:
-226            version = self._local.config_version
-227        if session is None or version != self._config_version:
-228            # Create a new session if config has changed
-229            session = self._local.session = self._make_session()
-230            self._local.config_version = self._config_version
-231        return session
-
- - -
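The session property at the end of the listing keeps one Session per thread (via threading.local) and rebuilds it whenever _config_version changes. A small sketch illustrating the per-thread behaviour, under the assumption that nothing else has touched the client:

import threading
from dlt.sources.helpers.requests import Client

client = Client()
sessions = []

def grab() -> None:
    sessions.append(client.session)  # created lazily, one per thread

threads = [threading.Thread(target=grab) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert sessions[0] is not sessions[1]  # each thread got its own Session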
-
-
- DEFAULT_RETRY_STATUS = - - (429, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599) - - -
- - - - -
-
-
- DEFAULT_RETRY_EXCEPTIONS = - - (<class 'requests.exceptions.ConnectionError'>, <class 'requests.exceptions.Timeout'>, <class 'requests.exceptions.ChunkedEncodingError'>) - - -
- - - - -
-
-
- RetryPredicate = -typing.Callable[[typing.Optional[requests.models.Response], typing.Optional[BaseException]], bool] - - -
- - - - -
-
- -
- - class - retry_if_status(tenacity.retry.retry_base): - - - -
- -
37class retry_if_status(retry_base):
-38    """Retry for given response status codes"""
-39
-40    def __init__(self, status_codes: Sequence[int]) -> None:
-41        self.status_codes = set(status_codes)
-42
-43    def __call__(self, retry_state: RetryCallState) -> bool:
-44        response = _get_retry_response(retry_state)
-45        if response is None:
-46            return False
-47        result = response.status_code in self.status_codes
-48        return result
-
- - -

Retry for given response status codes

-
- - -
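In practice this predicate is configured through Client rather than used directly. A sketch restricting status-based retries to a narrower set (the codes are chosen for illustration only):

from dlt.sources.helpers.requests import Client

# retry only on 429/503; passing an empty sequence disables status-based retries
client = Client(status_codes=(429, 503))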
- -
- - retry_if_status(status_codes: Sequence[int]) - - - -
- -
40    def __init__(self, status_codes: Sequence[int]) -> None:
-41        self.status_codes = set(status_codes)
-
- - - - -
-
-
- status_codes - - -
- - - - -
-
-
- -
- - class - retry_if_predicate(tenacity.retry.retry_base): - - - -
- -
51class retry_if_predicate(retry_base):
-52    def __init__(self, predicate: RetryPredicate) -> None:
-53        self.predicate = predicate
-54
-55    def __call__(self, retry_state: RetryCallState) -> bool:
-56        response = _get_retry_response(retry_state)
-57        exception = retry_state.outcome.exception()
-58        return self.predicate(response, exception)
-
- - -

Retry when a custom predicate, called with the response and/or exception, returns True.

-
- - -
- -
- - retry_if_predicate( predicate: Callable[[Optional[requests.models.Response], Optional[BaseException]], bool]) - - - -
- -
52    def __init__(self, predicate: RetryPredicate) -> None:
-53        self.predicate = predicate
-
- - - - -
-
-
- predicate - - -
- - - - -
-
-
- -
- - class - wait_exponential_retry_after(tenacity.wait.wait_exponential): - - - -
- -
61class wait_exponential_retry_after(wait_exponential):
-62    def _parse_retry_after(self, retry_after: str) -> Optional[float]:
-63        # Borrowed from urllib3
-64        seconds: float
-65        # Whitespace: https://tools.ietf.org/html/rfc7230#section-3.2.4
-66        if re.match(r"^\s*[0-9]+\s*$", retry_after):
-67            seconds = int(retry_after)
-68        else:
-69            retry_date_tuple = parsedate_tz(retry_after)
-70            if retry_date_tuple is None:
-71                return None
-72            retry_date = mktime_tz(retry_date_tuple)
-73            seconds = retry_date - time.time()
-74        return max(self.min, min(self.max, seconds))
-75
-76    def _get_retry_after(self, retry_state: RetryCallState) -> Optional[float]:
-77        response = _get_retry_response(retry_state)
-78        if response is None:
-79            return None
-80        header = response.headers.get("Retry-After")
-81        if not header:
-82            return None
-83        return self._parse_retry_after(header)
-84
-85    def __call__(self, retry_state: RetryCallState) -> float:
-86        retry_after = self._get_retry_after(retry_state)
-87        if retry_after is not None:
-88            return retry_after
-89        return super().__call__(retry_state)
-
- - -

Wait strategy that applies exponential backoff.

- -

It allows for a customized multiplier and an ability to restrict the -upper and lower limits to some maximum and minimum value.

- -

The intervals are fixed (i.e. there is no jitter), so this strategy is -suitable for balancing retries against latency when a required resource is -unavailable for an unknown duration, but not suitable for resolving -contention between multiple processes for a shared resource. Use -wait_random_exponential for the latter case.

-
- - -
-
Inherited Members
-
-
tenacity.wait.wait_exponential
-
wait_exponential
-
multiplier
-
min
-
max
-
exp_base
- -
-
-
-
-
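The _parse_retry_after source above accepts either an integer number of seconds or an HTTP date. A standalone sketch of the same parsing, with the clamping bounds made explicit as ordinary parameters:

import re
import time
from email.utils import mktime_tz, parsedate_tz
from typing import Optional

def parse_retry_after(header: str, lo: float = 0.0, hi: float = 300.0) -> Optional[float]:
    # integer-seconds form, allowing surrounding whitespace (RFC 7230)
    if re.match(r"^\s*[0-9]+\s*$", header):
        seconds: float = int(header)
    else:
        retry_date_tuple = parsedate_tz(header)  # HTTP-date form
        if retry_date_tuple is None:
            return None
        seconds = mktime_tz(retry_date_tuple) - time.time()
    return max(lo, min(hi, seconds))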
- -
- - class - Client: - - - -
- -
117class Client:
-118    """Wrapper for `requests` to create a `Session` with configurable retry functionality.
-119
-120    ### Summary
-121    Create a  `requests.Session` which automatically retries requests in case of error.
-122    By default retries are triggered for `5xx` and `429` status codes and when the server is unreachable or drops the connection.
-123
-124    ### Custom retry condition
-125    You can provide one or more custom predicates for specific retry conditions. The predicate is called after every request with the resulting response and/or exception.
-126    For example, this will trigger a retry when the response text is `error`:
-127
-128    >>> from typing import Optional
-129    >>> from requests import Response
-130    >>>
-131    >>> def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool:
-132    >>>     if response is None:
-133    >>>         return False
-134    >>>     return response.text == 'error'
-135
-136    The retry is triggered when either any of the predicates or the default conditions based on status code/exception are `True`.
-137
-138    ### Args:
-139        request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
-140        max_connections: Max connections per host in the HTTPAdapter pool
-141        raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
-142        session: Optional `requests.Session` instance to add the retry handler to. A new session is created by default.
-143        status_codes: Retry when response has any of these status codes. Default `429` and all `5xx` codes. Pass an empty list to disable retry based on status.
-144        exceptions: Retry on exception of given type(s). Default `(requests.Timeout, requests.ConnectionError)`. Pass an empty list to disable retry on exceptions.
-145        request_max_attempts: Max number of retry attempts before giving up
-146        retry_condition: A predicate or a list of predicates to decide whether to retry. If any predicate returns `True` the request is retried
-147        request_backoff_factor: Multiplier used for exponential delay between retries
-148        request_max_retry_delay: Maximum delay when using exponential backoff
-149        respect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay
-150        session_attrs: Extra attributes that will be set on the session instance, e.g. `{headers: {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes)
-151    """
-152    _session_attrs: Dict[str, Any]
-153
-154    @with_config(spec=RunConfiguration)
-155    def __init__(
-156        self,
-157        request_timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
-158        max_connections: int = 50,
-159        raise_for_status: bool = True,
-160        status_codes: Sequence[int] = DEFAULT_RETRY_STATUS,
-161        exceptions: Sequence[Type[Exception]] = DEFAULT_RETRY_EXCEPTIONS,
-162        request_max_attempts: int = RunConfiguration.request_max_attempts,
-163        retry_condition: Union[RetryPredicate, Sequence[RetryPredicate], None] = None,
-164        request_backoff_factor: float = RunConfiguration.request_backoff_factor,
-165        request_max_retry_delay: TimedeltaSeconds = RunConfiguration.request_max_retry_delay,
-166        respect_retry_after_header: bool = True,
-167        session_attrs: Optional[Dict[str, Any]] = None,
-168    ) -> None:
-169        self._adapter = HTTPAdapter(pool_maxsize=max_connections)
-170        self._local = local()
-171        self._session_kwargs = dict(timeout=request_timeout, raise_for_status=raise_for_status)
-172        self._retry_kwargs: Dict[str, Any] = dict(
-173            status_codes=status_codes,
-174            exceptions=exceptions,
-175            max_attempts=request_max_attempts,
-176            condition=retry_condition,
-177            backoff_factor=request_backoff_factor,
-178            respect_retry_after_header=respect_retry_after_header,
-179            max_delay=request_max_retry_delay
-180        )
-181        self._session_attrs = session_attrs or {}
-182
-183        if TYPE_CHECKING:
-184            self.get = self.session.get
-185            self.post = self.session.post
-186            self.put = self.session.put
-187            self.patch = self.session.patch
-188            self.delete = self.session.delete
-189            self.head = self.session.head
-190            self.options = self.session.options
-191            self.request = self.session.request
-192
-193        self.get = lambda *a, **kw: self.session.get(*a, **kw)
-194        self.post = lambda *a, **kw: self.session.post(*a, **kw)
-195        self.put = lambda *a, **kw: self.session.put(*a, **kw)
-196        self.patch = lambda *a, **kw: self.session.patch(*a, **kw)
-197        self.delete = lambda *a, **kw: self.session.delete(*a, **kw)
-198        self.head = lambda *a, **kw: self.session.head(*a, **kw)
-199        self.options = lambda *a, **kw: self.session.options(*a, **kw)
-200        self.request = lambda *a, **kw: self.session.request(*a, **kw)
-201
-202        self._config_version: int = 0  # Incrementing marker to ensure per-thread sessions are recreated on config changes
-203
-204    def update_from_config(self, config: RunConfiguration) -> None:
-205        """Update session/retry settings from RunConfiguration"""
-206        self._session_kwargs['timeout'] = config.request_timeout
-207        self._retry_kwargs['backoff_factor'] = config.request_backoff_factor
-208        self._retry_kwargs['max_delay'] = config.request_max_retry_delay
-209        self._retry_kwargs['max_attempts'] = config.request_max_attempts
-210        self._config_version += 1
-211
-212    def _make_session(self) -> Session:
-213        session = Session(**self._session_kwargs)  # type: ignore[arg-type]
-214        for key, value in self._session_attrs.items():
-215            setattr(session, key, value)
-216        session.mount('http://', self._adapter)
-217        session.mount('https://', self._adapter)
-218        retry = _make_retry(**self._retry_kwargs)
-219        session.request = retry.wraps(session.request)  # type: ignore[method-assign]
-220        return session
-221
-222    @property
-223    def session(self) -> Session:
-224        session: Optional[Session] = getattr(self._local, 'session', None)
-225        version = self._config_version
-226        if session is not None:
-227            version = self._local.config_version
-228        if session is None or version != self._config_version:
-229            # Create a new session if config has changed
-230            session = self._local.session = self._make_session()
-231            self._local.config_version = self._config_version
-232        return session
-
- - -

Wrapper for requests to create a Session with configurable retry functionality.

- -

Summary

- -

Create a requests.Session which automatically retries requests in case of error. -By default retries are triggered for 5xx and 429 status codes and when the server is unreachable or drops the connection.

- -

Custom retry condition

- -

You can provide one or more custom predicates for specific retry conditions. The predicate is called after every request with the resulting response and/or exception. -For example, this will trigger a retry when the response text is error:

- -
-
>>> from typing import Optional
->>> from requests import Response
->>>
->>> def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool:
->>>     if response is None:
->>>         return False
->>>     return response.text == 'error'
-
-
- -

The retry is triggered when either any of the predicates or the default conditions based on status code/exception are True.

- -

Args:

- -
request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
-max_connections: Max connections per host in the HTTPAdapter pool
-raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
-session: Optional `requests.Session` instance to add the retry handler to. A new session is created by default.
-status_codes: Retry when response has any of these status codes. Default `429` and all `5xx` codes. Pass an empty list to disable retry based on status.
-exceptions: Retry on exception of given type(s). Default `(requests.Timeout, requests.ConnectionError)`. Pass an empty list to disable retry on exceptions.
-request_max_attempts: Max number of retry attempts before giving up
-retry_condition: A predicate or a list of predicates to decide whether to retry. If any predicate returns `True` the request is retried
-request_backoff_factor: Multiplier used for exponential delay between retries
-request_max_retry_delay: Maximum delay when using exponential backoff
-respect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay
-session_attrs: Extra attributes that will be set on the session instance, e.g. `{headers: {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes)
-
-
- - -
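Pulling the docstring's pieces together, a minimal sketch of a Client that retries on the custom should_retry predicate from the example above (the URL is hypothetical):

from typing import Optional
from requests import Response
from dlt.sources.helpers.requests import Client

def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool:
    if response is None:
        return False
    return response.text == 'error'

client = Client(retry_condition=should_retry, request_max_attempts=3)
resp = client.get("https://api.example.com/flaky")  # hypothetical endpoint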
- -
-
@with_config(spec=RunConfiguration)
- - Client( request_timeout: Union[int, float, datetime.timedelta, Tuple[Union[int, float, datetime.timedelta], Union[int, float, datetime.timedelta]], NoneType] = 60, max_connections: int = 50, raise_for_status: bool = True, status_codes: Sequence[int] = (429, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599), exceptions: Sequence[Type[Exception]] = (<class 'requests.exceptions.ConnectionError'>, <class 'requests.exceptions.Timeout'>, <class 'requests.exceptions.ChunkedEncodingError'>), request_max_attempts: int = 5, retry_condition: Union[Callable[[Optional[requests.models.Response], Optional[BaseException]], bool], Sequence[Callable[[Optional[requests.models.Response], Optional[BaseException]], bool]], NoneType] = None, request_backoff_factor: float = 1, request_max_retry_delay: Union[int, float, datetime.timedelta] = 300, respect_retry_after_header: bool = True, session_attrs: Optional[Dict[str, Any]] = None) - - - -
- -
154    @with_config(spec=RunConfiguration)
-155    def __init__(
-156        self,
-157        request_timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
-158        max_connections: int = 50,
-159        raise_for_status: bool = True,
-160        status_codes: Sequence[int] = DEFAULT_RETRY_STATUS,
-161        exceptions: Sequence[Type[Exception]] = DEFAULT_RETRY_EXCEPTIONS,
-162        request_max_attempts: int = RunConfiguration.request_max_attempts,
-163        retry_condition: Union[RetryPredicate, Sequence[RetryPredicate], None] = None,
-164        request_backoff_factor: float = RunConfiguration.request_backoff_factor,
-165        request_max_retry_delay: TimedeltaSeconds = RunConfiguration.request_max_retry_delay,
-166        respect_retry_after_header: bool = True,
-167        session_attrs: Optional[Dict[str, Any]] = None,
-168    ) -> None:
-169        self._adapter = HTTPAdapter(pool_maxsize=max_connections)
-170        self._local = local()
-171        self._session_kwargs = dict(timeout=request_timeout, raise_for_status=raise_for_status)
-172        self._retry_kwargs: Dict[str, Any] = dict(
-173            status_codes=status_codes,
-174            exceptions=exceptions,
-175            max_attempts=request_max_attempts,
-176            condition=retry_condition,
-177            backoff_factor=request_backoff_factor,
-178            respect_retry_after_header=respect_retry_after_header,
-179            max_delay=request_max_retry_delay
-180        )
-181        self._session_attrs = session_attrs or {}
-182
-183        if TYPE_CHECKING:
-184            self.get = self.session.get
-185            self.post = self.session.post
-186            self.put = self.session.put
-187            self.patch = self.session.patch
-188            self.delete = self.session.delete
-189            self.head = self.session.head
-190            self.options = self.session.options
-191            self.request = self.session.request
-192
-193        self.get = lambda *a, **kw: self.session.get(*a, **kw)
-194        self.post = lambda *a, **kw: self.session.post(*a, **kw)
-195        self.put = lambda *a, **kw: self.session.put(*a, **kw)
-196        self.patch = lambda *a, **kw: self.session.patch(*a, **kw)
-197        self.delete = lambda *a, **kw: self.session.delete(*a, **kw)
-198        self.head = lambda *a, **kw: self.session.head(*a, **kw)
-199        self.options = lambda *a, **kw: self.session.options(*a, **kw)
-200        self.request = lambda *a, **kw: self.session.request(*a, **kw)
-201
-202        self._config_version: int = 0  # Incrementing marker to ensure per-thread sessions are recreated on config changes
-
- - - - -
-
-
- get - - -
- - - - -
-
-
- post - - -
- - - - -
-
-
- put - - -
- - - - -
-
-
- patch - - -
- - - - -
-
-
- delete - - -
- - - - -
-
-
- head - - -
- - - - -
-
-
- options - - -
- - - - -
-
-
- request - - -
- - - - -
-
- -
- - def - update_from_config( self, config: dlt.common.configuration.specs.run_configuration.RunConfiguration) -> None: - - - -
- -
204    def update_from_config(self, config: RunConfiguration) -> None:
-205        """Update session/retry settings from RunConfiguration"""
-206        self._session_kwargs['timeout'] = config.request_timeout
-207        self._retry_kwargs['backoff_factor'] = config.request_backoff_factor
-208        self._retry_kwargs['max_delay'] = config.request_max_retry_delay
-209        self._retry_kwargs['max_attempts'] = config.request_max_attempts
-210        self._config_version += 1
-
- - -

Update session/retry settings from RunConfiguration

-
- - -
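A sketch of reconfiguring the shared client at runtime; this assumes a plain RunConfiguration instance carries the default request_* fields shown in the signature above:

from dlt.common.configuration.specs import RunConfiguration
from dlt.sources.helpers import requests

# bumps the internal config version, so per-thread sessions are rebuilt lazily
requests.client.update_from_config(RunConfiguration())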
-
- - - - - -
-
-
- - \ No newline at end of file diff --git a/docs/website/static/api_reference/dlt/sources/helpers/requests/session.html b/docs/website/static/api_reference/dlt/sources/helpers/requests/session.html deleted file mode 100644 index 77e66d3a58..0000000000 --- a/docs/website/static/api_reference/dlt/sources/helpers/requests/session.html +++ /dev/null @@ -1,534 +0,0 @@ - - - - - - - dlt.sources.helpers.requests.session API documentation - - - - - - - - - -
-
-

-dlt.sources.helpers.requests.session

- - - - - - -
 1from requests import Session as BaseSession
- 2from tenacity import Retrying, retry_if_exception_type
- 3from typing import Optional, TYPE_CHECKING, Sequence, Union, Tuple, Type, TypeVar
- 4
- 5from dlt.sources.helpers.requests.typing import TRequestTimeout
- 6from dlt.common.typing import TimedeltaSeconds
- 7from dlt.common.time import to_seconds
- 8from dlt.version import __version__
- 9
-10
-11TSession = TypeVar("TSession", bound=BaseSession)
-12
-13
-14DEFAULT_TIMEOUT = 60
-15
-16
-17def _timeout_to_seconds(timeout: TRequestTimeout) -> Optional[Union[Tuple[float, float], float]]:
-18    return (to_seconds(timeout[0]), to_seconds(timeout[1])) if isinstance(timeout, tuple) else to_seconds(timeout)
-19
-20
-21class Session(BaseSession):
-22    """Requests session which by default adds a timeout to all requests and calls `raise_for_status()` on response
-23
-24    ### Args
-25        timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
-26            May be a single value or a tuple for separate (connect, read) timeout.
-27        raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
-28    """
-29    def __init__(
-30        self,
-31        timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
-32        raise_for_status: bool = True,
-33    ) -> None:
-34        super().__init__()
-35        self.timeout = _timeout_to_seconds(timeout)
-36        self.raise_for_status = raise_for_status
-37        self.headers.update({
-38            "User-Agent": f"dlt/{__version__}",
-39        })
-40
-41    if TYPE_CHECKING:
-42        request = BaseSession.request
-43
-44    def request(self, *args, **kwargs):  # type: ignore[no-untyped-def,no-redef]
-45        kwargs.setdefault('timeout', self.timeout)
-46        resp = super().request(*args, **kwargs)
-47        if self.raise_for_status:
-48            resp.raise_for_status()
-49        return resp
-
- - -
-
-
- DEFAULT_TIMEOUT = 60
- - - - -
-
- -
- class Session(requests.sessions.Session):
- -
22class Session(BaseSession):
-23    """Requests session which by default adds a timeout to all requests and calls `raise_for_status()` on response
-24
-25    ### Args
-26        timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
-27            May be a single value or a tuple for separate (connect, read) timeout.
-28        raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
-29    """
-30    def __init__(
-31        self,
-32        timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
-33        raise_for_status: bool = True,
-34    ) -> None:
-35        super().__init__()
-36        self.timeout = _timeout_to_seconds(timeout)
-37        self.raise_for_status = raise_for_status
-38        self.headers.update({
-39            "User-Agent": f"dlt/{__version__}",
-40        })
-41
-42    if TYPE_CHECKING:
-43        request = BaseSession.request
-44
-45    def request(self, *args, **kwargs):  # type: ignore[no-untyped-def,no-redef]
-46        kwargs.setdefault('timeout', self.timeout)
-47        resp = super().request(*args, **kwargs)
-48        if self.raise_for_status:
-49            resp.raise_for_status()
-50        return resp
-
- - -

Requests session which by default adds a timeout to all requests and calls raise_for_status() on response

- -

Args

- -
timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.
    May be a single value or a tuple for separate (connect, read) timeout.
raise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)
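
A minimal usage sketch of this class (module path taken from this page; the URL is illustrative):

```python
# A minimal sketch, assuming dlt's Session as documented above: every
# request gets the default timeout and non-2xx responses raise immediately.
from datetime import timedelta
from dlt.sources.helpers.requests.session import Session

session = Session(timeout=timedelta(seconds=10), raise_for_status=True)
# Separate (connect, read) timeouts are also accepted:
split_session = Session(timeout=(5, 30))
response = session.get("https://example.com/data.json")  # illustrative URL
```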
-
-
- - -
- -
- Session(timeout: Union[int, float, datetime.timedelta, Tuple[Union[int, float, datetime.timedelta], Union[int, float, datetime.timedelta]], NoneType] = 60, raise_for_status: bool = True)
- -
30    def __init__(
-31        self,
-32        timeout: Optional[Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]] = DEFAULT_TIMEOUT,
-33        raise_for_status: bool = True,
-34    ) -> None:
-35        super().__init__()
-36        self.timeout = _timeout_to_seconds(timeout)
-37        self.raise_for_status = raise_for_status
-38        self.headers.update({
-39            "User-Agent": f"dlt/{__version__}",
-40        })
-
Instance attributes: timeout, raise_for_status
- -
- def request(self, *args, **kwargs):
- -
45    def request(self, *args, **kwargs):  # type: ignore[no-untyped-def,no-redef]
-46        kwargs.setdefault('timeout', self.timeout)
-47        resp = super().request(*args, **kwargs)
-48        if self.raise_for_status:
-49            resp.raise_for_status()
-50        return resp
-
- - -

Constructs a Request, prepares it, and sends it. Returns a Response object.

Parameters

  • method: method for the new Request object.
  • url: URL for the new Request object.
  • params: (optional) Dictionary or bytes to be sent in the query string for the Request.
  • data: (optional) Dictionary, list of tuples, bytes, or file-like object to send in the body of the Request.
  • json: (optional) json to send in the body of the Request.
  • headers: (optional) Dictionary of HTTP Headers to send with the Request.
  • cookies: (optional) Dict or CookieJar object to send with the Request.
  • files: (optional) Dictionary of 'filename': file-like-objects for multipart encoding upload.
  • auth: (optional) Auth tuple or callable to enable Basic/Digest/Custom HTTP Auth.
  • timeout: (optional) How long to wait for the server to send data before giving up, as a float, or a (connect timeout, read timeout) tuple.
  • allow_redirects: (optional) Set to True by default.
  • proxies: (optional) Dictionary mapping protocol or protocol and hostname to the URL of the proxy.
  • stream: (optional) whether to immediately download the response content. Defaults to False.
  • verify: (optional) Either a boolean, in which case it controls whether we verify the server's TLS certificate, or a string, in which case it must be a path to a CA bundle to use. Defaults to True. When set to False, requests will accept any TLS certificate presented by the server, and will ignore hostname mismatches and/or expired certificates, which will make your application vulnerable to man-in-the-middle (MitM) attacks. Setting verify to False may be useful during local development or testing.
  • cert: (optional) if String, path to ssl client cert file (.pem). If Tuple, ('cert', 'key') pair.
- - -
-
-
Inherited Members

requests.sessions.Session:
    headers, auth, proxies, hooks, params, stream, verify, cert,
    max_redirects, trust_env, cookies, adapters, prepare_request,
    get, options, head, post, put, patch, delete, send,
    merge_environment_settings, get_adapter, close, mount

requests.sessions.SessionRedirectMixin:
    get_redirect_target, should_strip_auth, resolve_redirects,
    rebuild_auth, rebuild_proxies, rebuild_method
- - \ No newline at end of file diff --git a/docs/website/static/api_reference/dlt/sources/helpers/requests/typing.html b/docs/website/static/api_reference/dlt/sources/helpers/requests/typing.html deleted file mode 100644 index 3eb2b78aca..0000000000 --- a/docs/website/static/api_reference/dlt/sources/helpers/requests/typing.html +++ /dev/null @@ -1,263 +0,0 @@ - - - - - - - dlt.sources.helpers.requests.typing API documentation - - - - - - - - - -
-
-

-dlt.sources.helpers.requests.typing

- - - - - - -
1from typing import Tuple, Union, Optional
-2
-3from dlt.common.typing import TimedeltaSeconds
-4
-5# Either a single timeout or tuple (connect,read) timeout
-6TRequestTimeout = Union[TimedeltaSeconds, Tuple[TimedeltaSeconds, TimedeltaSeconds]]
-
- - -
-
-
- TRequestTimeout = typing.Union[int, float, datetime.timedelta, typing.Tuple[typing.Union[int, float, datetime.timedelta], typing.Union[int, float, datetime.timedelta]]]
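
For illustration, a few values that satisfy this alias (a sketch; variable names are arbitrary):

```python
from datetime import timedelta

from dlt.sources.helpers.requests.typing import TRequestTimeout

t_int: TRequestTimeout = 30                           # seconds as int
t_float: TRequestTimeout = 2.5                        # seconds as float
t_delta: TRequestTimeout = timedelta(minutes=1)       # a timedelta
t_pair: TRequestTimeout = (5, timedelta(seconds=60))  # separate (connect, read)
```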
- - - - -
-
- - \ No newline at end of file diff --git a/docs/website/static/api_reference/dlt/sources/helpers/transform.html b/docs/website/static/api_reference/dlt/sources/helpers/transform.html deleted file mode 100644 index ab0d953f36..0000000000 --- a/docs/website/static/api_reference/dlt/sources/helpers/transform.html +++ /dev/null @@ -1,323 +0,0 @@ - - - - - - - dlt.sources.helpers.transform API documentation - - - - - - - - - -
-
-

-dlt.sources.helpers.transform

- - - - - - -
 1from dlt.common.typing import TDataItem
- 2from dlt.extract.typing import ItemTransformFunctionNoMeta
- 3
- 4
- 5def take_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]:
- 6    """A filter that takes only first `max_items` from a resource"""
- 7    count: int = 0
- 8    def _filter(_: TDataItem) -> bool:
- 9        nonlocal count
-10        count += 1
-11        return count <= max_items
-12    return _filter
-13
-14
-15def skip_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]:
-16    """A filter that skips first `max_items` from a resource"""
-17    count: int = 0
-18    def _filter(_: TDataItem) -> bool:
-19        nonlocal count
-20        count += 1
-21        return count > max_items
-22    return _filter
-
- - -
-
- -
- def take_first(max_items: int) -> Callable[[Any], bool]:
- -
 6def take_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]:
- 7    """A filter that takes only first `max_items` from a resource"""
- 8    count: int = 0
- 9    def _filter(_: TDataItem) -> bool:
-10        nonlocal count
-11        count += 1
-12        return count <= max_items
-13    return _filter
-
- - -

A filter that takes only first max_items from a resource

-
- - -
-
- -
- def skip_first(max_items: int) -> Callable[[Any], bool]:
- -
16def skip_first(max_items: int) -> ItemTransformFunctionNoMeta[bool]:
-17    """A filter that skips first `max_items` from a resource"""
-18    count: int = 0
-19    def _filter(_: TDataItem) -> bool:
-20        nonlocal count
-21        count += 1
-22        return count > max_items
-23    return _filter
-
- - -

A filter that skips first max_items from a resource
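
Taken together, a short sketch of how these filters are typically attached to a resource (assuming dlt's `add_filter` hook on resources and that filters run in the order they are added):

```python
# A sketch assuming dlt's resource transform hooks; take_first/skip_first
# return stateful per-item predicates as defined above.
import dlt
from dlt.sources.helpers.transform import skip_first, take_first

@dlt.resource
def numbers():
    yield from range(100)

numbers.add_filter(skip_first(10))  # drop items 0..9
numbers.add_filter(take_first(10))  # then keep the next 10 items (values 10..19)

print(list(numbers))  # resources are iterable, which allows quick inspection
```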

-
- - -
-
- - \ No newline at end of file diff --git a/docs/website/static/api_reference/index.html b/docs/website/static/api_reference/index.html deleted file mode 100644 index a7829f45b5..0000000000 --- a/docs/website/static/api_reference/index.html +++ /dev/null @@ -1,7 +0,0 @@ - - - - - - - diff --git a/docs/website/static/api_reference/search.js b/docs/website/static/api_reference/search.js deleted file mode 100644 index a8a8b9f84d..0000000000 --- a/docs/website/static/api_reference/search.js +++ /dev/null @@ -1,46 +0,0 @@ -window.pdocSearch = (function(){ -/** elasticlunr - http://weixsong.github.io * Copyright (C) 2017 Oliver Nightingale * Copyright (C) 2017 Wei Song * MIT Licensed */!function(){function e(e){if(null===e||"object"!=typeof e)return e;var t=e.constructor();for(var n in e)e.hasOwnProperty(n)&&(t[n]=e[n]);return t}var t=function(e){var n=new t.Index;return n.pipeline.add(t.trimmer,t.stopWordFilter,t.stemmer),e&&e.call(n,n),n};t.version="0.9.5",lunr=t,t.utils={},t.utils.warn=function(e){return function(t){e.console&&console.warn&&console.warn(t)}}(this),t.utils.toString=function(e){return void 0===e||null===e?"":e.toString()},t.EventEmitter=function(){this.events={}},t.EventEmitter.prototype.addListener=function(){var e=Array.prototype.slice.call(arguments),t=e.pop(),n=e;if("function"!=typeof t)throw new TypeError("last argument must be a function");n.forEach(function(e){this.hasHandler(e)||(this.events[e]=[]),this.events[e].push(t)},this)},t.EventEmitter.prototype.removeListener=function(e,t){if(this.hasHandler(e)){var n=this.events[e].indexOf(t);-1!==n&&(this.events[e].splice(n,1),0==this.events[e].length&&delete this.events[e])}},t.EventEmitter.prototype.emit=function(e){if(this.hasHandler(e)){var t=Array.prototype.slice.call(arguments,1);this.events[e].forEach(function(e){e.apply(void 0,t)},this)}},t.EventEmitter.prototype.hasHandler=function(e){return e in this.events},t.tokenizer=function(e){if(!arguments.length||null===e||void 0===e)return[];if(Array.isArray(e)){var n=e.filter(function(e){return null===e||void 0===e?!1:!0});n=n.map(function(e){return t.utils.toString(e).toLowerCase()});var i=[];return n.forEach(function(e){var n=e.split(t.tokenizer.seperator);i=i.concat(n)},this),i}return e.toString().trim().toLowerCase().split(t.tokenizer.seperator)},t.tokenizer.defaultSeperator=/[\s\-]+/,t.tokenizer.seperator=t.tokenizer.defaultSeperator,t.tokenizer.setSeperator=function(e){null!==e&&void 0!==e&&"object"==typeof e&&(t.tokenizer.seperator=e)},t.tokenizer.resetSeperator=function(){t.tokenizer.seperator=t.tokenizer.defaultSeperator},t.tokenizer.getSeperator=function(){return t.tokenizer.seperator},t.Pipeline=function(){this._queue=[]},t.Pipeline.registeredFunctions={},t.Pipeline.registerFunction=function(e,n){n in t.Pipeline.registeredFunctions&&t.utils.warn("Overwriting existing registered function: "+n),e.label=n,t.Pipeline.registeredFunctions[n]=e},t.Pipeline.getRegisteredFunction=function(e){return e in t.Pipeline.registeredFunctions!=!0?null:t.Pipeline.registeredFunctions[e]},t.Pipeline.warnIfFunctionNotRegistered=function(e){var n=e.label&&e.label in this.registeredFunctions;n||t.utils.warn("Function is not registered with pipeline. 
This may cause problems when serialising the index.\n",e)},t.Pipeline.load=function(e){var n=new t.Pipeline;return e.forEach(function(e){var i=t.Pipeline.getRegisteredFunction(e);if(!i)throw new Error("Cannot load un-registered function: "+e);n.add(i)}),n},t.Pipeline.prototype.add=function(){var e=Array.prototype.slice.call(arguments);e.forEach(function(e){t.Pipeline.warnIfFunctionNotRegistered(e),this._queue.push(e)},this)},t.Pipeline.prototype.after=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i+1,0,n)},t.Pipeline.prototype.before=function(e,n){t.Pipeline.warnIfFunctionNotRegistered(n);var i=this._queue.indexOf(e);if(-1===i)throw new Error("Cannot find existingFn");this._queue.splice(i,0,n)},t.Pipeline.prototype.remove=function(e){var t=this._queue.indexOf(e);-1!==t&&this._queue.splice(t,1)},t.Pipeline.prototype.run=function(e){for(var t=[],n=e.length,i=this._queue.length,o=0;n>o;o++){for(var r=e[o],s=0;i>s&&(r=this._queue[s](r,o,e),void 0!==r&&null!==r);s++);void 0!==r&&null!==r&&t.push(r)}return t},t.Pipeline.prototype.reset=function(){this._queue=[]},t.Pipeline.prototype.get=function(){return this._queue},t.Pipeline.prototype.toJSON=function(){return this._queue.map(function(e){return t.Pipeline.warnIfFunctionNotRegistered(e),e.label})},t.Index=function(){this._fields=[],this._ref="id",this.pipeline=new t.Pipeline,this.documentStore=new t.DocumentStore,this.index={},this.eventEmitter=new t.EventEmitter,this._idfCache={},this.on("add","remove","update",function(){this._idfCache={}}.bind(this))},t.Index.prototype.on=function(){var e=Array.prototype.slice.call(arguments);return this.eventEmitter.addListener.apply(this.eventEmitter,e)},t.Index.prototype.off=function(e,t){return this.eventEmitter.removeListener(e,t)},t.Index.load=function(e){e.version!==t.version&&t.utils.warn("version mismatch: current "+t.version+" importing "+e.version);var n=new this;n._fields=e.fields,n._ref=e.ref,n.documentStore=t.DocumentStore.load(e.documentStore),n.pipeline=t.Pipeline.load(e.pipeline),n.index={};for(var i in e.index)n.index[i]=t.InvertedIndex.load(e.index[i]);return n},t.Index.prototype.addField=function(e){return this._fields.push(e),this.index[e]=new t.InvertedIndex,this},t.Index.prototype.setRef=function(e){return this._ref=e,this},t.Index.prototype.saveDocument=function(e){return this.documentStore=new t.DocumentStore(e),this},t.Index.prototype.addDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.addDoc(i,e),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));this.documentStore.addFieldLength(i,n,o.length);var r={};o.forEach(function(e){e in r?r[e]+=1:r[e]=1},this);for(var s in r){var u=r[s];u=Math.sqrt(u),this.index[n].addToken(s,{ref:i,tf:u})}},this),n&&this.eventEmitter.emit("add",e,this)}},t.Index.prototype.removeDocByRef=function(e){if(e&&this.documentStore.isDocStored()!==!1&&this.documentStore.hasDoc(e)){var t=this.documentStore.getDoc(e);this.removeDoc(t,!1)}},t.Index.prototype.removeDoc=function(e,n){if(e){var n=void 0===n?!0:n,i=e[this._ref];this.documentStore.hasDoc(i)&&(this.documentStore.removeDoc(i),this._fields.forEach(function(n){var o=this.pipeline.run(t.tokenizer(e[n]));o.forEach(function(e){this.index[n].removeToken(e,i)},this)},this),n&&this.eventEmitter.emit("remove",e,this))}},t.Index.prototype.updateDoc=function(e,t){var t=void 
0===t?!0:t;this.removeDocByRef(e[this._ref],!1),this.addDoc(e,!1),t&&this.eventEmitter.emit("update",e,this)},t.Index.prototype.idf=function(e,t){var n="@"+t+"/"+e;if(Object.prototype.hasOwnProperty.call(this._idfCache,n))return this._idfCache[n];var i=this.index[t].getDocFreq(e),o=1+Math.log(this.documentStore.length/(i+1));return this._idfCache[n]=o,o},t.Index.prototype.getFields=function(){return this._fields.slice()},t.Index.prototype.search=function(e,n){if(!e)return[];e="string"==typeof e?{any:e}:JSON.parse(JSON.stringify(e));var i=null;null!=n&&(i=JSON.stringify(n));for(var o=new t.Configuration(i,this.getFields()).get(),r={},s=Object.keys(e),u=0;u0&&t.push(e);for(var i in n)"docs"!==i&&"df"!==i&&this.expandToken(e+i,t,n[i]);return t},t.InvertedIndex.prototype.toJSON=function(){return{root:this.root}},t.Configuration=function(e,n){var e=e||"";if(void 0==n||null==n)throw new Error("fields should not be null");this.config={};var i;try{i=JSON.parse(e),this.buildUserConfig(i,n)}catch(o){t.utils.warn("user configuration parse failed, will use default configuration"),this.buildDefaultConfig(n)}},t.Configuration.prototype.buildDefaultConfig=function(e){this.reset(),e.forEach(function(e){this.config[e]={boost:1,bool:"OR",expand:!1}},this)},t.Configuration.prototype.buildUserConfig=function(e,n){var i="OR",o=!1;if(this.reset(),"bool"in e&&(i=e.bool||i),"expand"in e&&(o=e.expand||o),"fields"in e)for(var r in e.fields)if(n.indexOf(r)>-1){var s=e.fields[r],u=o;void 0!=s.expand&&(u=s.expand),this.config[r]={boost:s.boost||0===s.boost?s.boost:1,bool:s.bool||i,expand:u}}else t.utils.warn("field name in user configuration not found in index instance fields");else this.addAllFields2UserConfig(i,o,n)},t.Configuration.prototype.addAllFields2UserConfig=function(e,t,n){n.forEach(function(n){this.config[n]={boost:1,bool:e,expand:t}},this)},t.Configuration.prototype.get=function(){return this.config},t.Configuration.prototype.reset=function(){this.config={}},lunr.SortedSet=function(){this.length=0,this.elements=[]},lunr.SortedSet.load=function(e){var t=new this;return t.elements=e,t.length=e.length,t},lunr.SortedSet.prototype.add=function(){var e,t;for(e=0;e1;){if(r===e)return o;e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o]}return r===e?o:-1},lunr.SortedSet.prototype.locationFor=function(e){for(var t=0,n=this.elements.length,i=n-t,o=t+Math.floor(i/2),r=this.elements[o];i>1;)e>r&&(t=o),r>e&&(n=o),i=n-t,o=t+Math.floor(i/2),r=this.elements[o];return r>e?o:e>r?o+1:void 0},lunr.SortedSet.prototype.intersect=function(e){for(var t=new lunr.SortedSet,n=0,i=0,o=this.length,r=e.length,s=this.elements,u=e.elements;;){if(n>o-1||i>r-1)break;s[n]!==u[i]?s[n]u[i]&&i++:(t.add(s[n]),n++,i++)}return t},lunr.SortedSet.prototype.clone=function(){var e=new lunr.SortedSet;return e.elements=this.toArray(),e.length=e.elements.length,e},lunr.SortedSet.prototype.union=function(e){var t,n,i;this.length>=e.length?(t=this,n=e):(t=e,n=this),i=t.clone();for(var o=0,r=n.toArray();o

\n"}, {"fullname": "dlt.sources", "modulename": "dlt.sources", "kind": "module", "doc": "

Module with built in sources and source building blocks

\n"}, {"fullname": "dlt.sources.helpers", "modulename": "dlt.sources.helpers", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests", "modulename": "dlt.sources.helpers.requests", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.client", "modulename": "dlt.sources.helpers.requests", "qualname": "client", "kind": "variable", "doc": "

\n", "default_value": "<dlt.sources.helpers.requests.retry.Client object>"}, {"fullname": "dlt.sources.helpers.requests.init", "modulename": "dlt.sources.helpers.requests", "qualname": "init", "kind": "function", "doc": "

Initialize the default requests client from config

\n", "signature": "(\tconfig: dlt.common.configuration.specs.run_configuration.RunConfiguration) -> None:", "funcdef": "def"}, {"fullname": "dlt.sources.helpers.requests.retry", "modulename": "dlt.sources.helpers.requests.retry", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.DEFAULT_RETRY_STATUS", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "DEFAULT_RETRY_STATUS", "kind": "variable", "doc": "

\n", "default_value": "(429, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599)"}, {"fullname": "dlt.sources.helpers.requests.retry.DEFAULT_RETRY_EXCEPTIONS", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "DEFAULT_RETRY_EXCEPTIONS", "kind": "variable", "doc": "

\n", "default_value": "(<class 'requests.exceptions.ConnectionError'>, <class 'requests.exceptions.Timeout'>, <class 'requests.exceptions.ChunkedEncodingError'>)"}, {"fullname": "dlt.sources.helpers.requests.retry.RetryPredicate", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "RetryPredicate", "kind": "variable", "doc": "

\n", "default_value": "typing.Callable[[typing.Optional[requests.models.Response], typing.Optional[BaseException]], bool]"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_status", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_status", "kind": "class", "doc": "

Retry for given response status codes

\n", "bases": "tenacity.retry.retry_base"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_status.__init__", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_status.__init__", "kind": "function", "doc": "

\n", "signature": "(status_codes: Sequence[int])"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_status.status_codes", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_status.status_codes", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_predicate", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_predicate", "kind": "class", "doc": "

Abstract base class for retry strategies.

\n", "bases": "tenacity.retry.retry_base"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_predicate.__init__", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_predicate.__init__", "kind": "function", "doc": "

\n", "signature": "(\tpredicate: Callable[[Optional[requests.models.Response], Optional[BaseException]], bool])"}, {"fullname": "dlt.sources.helpers.requests.retry.retry_if_predicate.predicate", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "retry_if_predicate.predicate", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.wait_exponential_retry_after", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "wait_exponential_retry_after", "kind": "class", "doc": "

Wait strategy that applies exponential backoff.

\n\n

It allows for a customized multiplier and an ability to restrict the\nupper and lower limits to some maximum and minimum value.

\n\n

The intervals are fixed (i.e. there is no jitter), so this strategy is\nsuitable for balancing retries against latency when a required resource is\nunavailable for an unknown duration, but not suitable for resolving\ncontention between multiple processes for a shared resource. Use\nwait_random_exponential for the latter case.

\n", "bases": "tenacity.wait.wait_exponential"}, {"fullname": "dlt.sources.helpers.requests.retry.Client", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client", "kind": "class", "doc": "

Wrapper for requests to create a Session with configurable retry functionality.

\n\n

Summary

\n\n

Create a requests.Session which automatically retries requests in case of error.\nBy default retries are triggered for 5xx and 429 status codes and when the server is unreachable or drops connection.

\n\n

Custom retry condition

\n\n

You can provide one or more custom predicates for specific retry condition. The predicate is called after every request with the resulting response and/or exception.\nFor example, this will trigger a retry when the response text is error:

\n\n
\n
>>> from typing import Optional\n>>> from requests import Response\n>>>\n>>> def should_retry(response: Optional[Response], exception: Optional[BaseException]) -> bool:\n>>>     if response is None:\n>>>         return False\n>>>     return response.text == 'error'\n
\n
\n\n

The retry is triggered when either any of the predicates or the default conditions based on status code/exception are True.

\n\n

Args:

\n\n
request_timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.\nmax_connections: Max connections per host in the HTTPAdapter pool\nraise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)\nsession: Optional `requests.Session` instance to add the retry handler to. A new session is created by default.\nstatus_codes: Retry when response has any of these status codes. Default `429` and all `5xx` codes. Pass an empty list to disable retry based on status.\nexceptions: Retry on exception of given type(s). Default `(requests.Timeout, requests.ConnectionError)`. Pass an empty list to disable retry on exceptions.\nrequest_max_attempts: Max number of retry attempts before giving up\nretry_condition: A predicate or a list of predicates to decide whether to retry. If any predicate returns `True` the request is retried\nrequest_backoff_factor: Multiplier used for exponential delay between retries\nrequest_max_retry_delay: Maximum delay when using exponential backoff\nrespect_retry_after_header: Whether to use the `Retry-After` response header (when available) to determine the retry delay\nsession_attrs: Extra attributes that will be set on the session instance, e.g. `{headers: {'Authorization': 'api-key'}}` (see `requests.sessions.Session` for possible attributes)\n
\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.__init__", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.__init__", "kind": "function", "doc": "

\n", "signature": "(\trequest_timeout: Union[int, float, datetime.timedelta, Tuple[Union[int, float, datetime.timedelta], Union[int, float, datetime.timedelta]], NoneType] = 60,\tmax_connections: int = 50,\traise_for_status: bool = True,\tstatus_codes: Sequence[int] = (429, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599),\texceptions: Sequence[Type[Exception]] = (<class 'requests.exceptions.ConnectionError'>, <class 'requests.exceptions.Timeout'>, <class 'requests.exceptions.ChunkedEncodingError'>),\trequest_max_attempts: int = 5,\tretry_condition: Union[Callable[[Optional[requests.models.Response], Optional[BaseException]], bool], Sequence[Callable[[Optional[requests.models.Response], Optional[BaseException]], bool]], NoneType] = None,\trequest_backoff_factor: float = 1,\trequest_max_retry_delay: Union[int, float, datetime.timedelta] = 300,\trespect_retry_after_header: bool = True,\tsession_attrs: Optional[Dict[str, Any]] = None)"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.get", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.get", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.post", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.post", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.put", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.put", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.patch", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.patch", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.delete", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.delete", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.head", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.head", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.options", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.options", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.request", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.request", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.update_from_config", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.update_from_config", "kind": "function", "doc": "

Update session/retry settings from RunConfiguration

\n", "signature": "(\tself,\tconfig: dlt.common.configuration.specs.run_configuration.RunConfiguration) -> None:", "funcdef": "def"}, {"fullname": "dlt.sources.helpers.requests.retry.Client.session", "modulename": "dlt.sources.helpers.requests.retry", "qualname": "Client.session", "kind": "variable", "doc": "

\n", "annotation": ": dlt.sources.helpers.requests.session.Session"}, {"fullname": "dlt.sources.helpers.requests.session", "modulename": "dlt.sources.helpers.requests.session", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.session.DEFAULT_TIMEOUT", "modulename": "dlt.sources.helpers.requests.session", "qualname": "DEFAULT_TIMEOUT", "kind": "variable", "doc": "

\n", "default_value": "60"}, {"fullname": "dlt.sources.helpers.requests.session.Session", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session", "kind": "class", "doc": "

Requests session which by default adds a timeout to all requests and calls raise_for_status() on response

\n\n

Args

\n\n
timeout: Timeout for requests in seconds. May be passed as `timedelta` or `float/int` number of seconds.\n    May be a single value or a tuple for separate (connect, read) timeout.\nraise_for_status: Whether to raise exception on error status codes (using `response.raise_for_status()`)\n
\n", "bases": "requests.sessions.Session"}, {"fullname": "dlt.sources.helpers.requests.session.Session.__init__", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session.__init__", "kind": "function", "doc": "

\n", "signature": "(\ttimeout: Union[int, float, datetime.timedelta, Tuple[Union[int, float, datetime.timedelta], Union[int, float, datetime.timedelta]], NoneType] = 60,\traise_for_status: bool = True)"}, {"fullname": "dlt.sources.helpers.requests.session.Session.timeout", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session.timeout", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.session.Session.raise_for_status", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session.raise_for_status", "kind": "variable", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.session.Session.request", "modulename": "dlt.sources.helpers.requests.session", "qualname": "Session.request", "kind": "function", "doc": "

Constructs a Request <Request>, prepares it and sends it.\nReturns Response <Response> object.

\n\n
Parameters
\n\n
    \n
  • method: method for the new Request object.
  • \n
  • url: URL for the new Request object.
  • \n
  • params: (optional) Dictionary or bytes to be sent in the query\nstring for the Request.
  • \n
  • data: (optional) Dictionary, list of tuples, bytes, or file-like\nobject to send in the body of the Request.
  • \n
  • json: (optional) json to send in the body of the\nRequest.
  • \n
  • headers: (optional) Dictionary of HTTP Headers to send with the\nRequest.
  • \n
  • cookies: (optional) Dict or CookieJar object to send with the\nRequest.
  • \n
  • **files: (optional) Dictionary of 'filename'**: file-like-objects\nfor multipart encoding upload.
  • \n
  • auth: (optional) Auth tuple or callable to enable\nBasic/Digest/Custom HTTP Auth.
  • \n
  • timeout: (optional) How long to wait for the server to send\ndata before giving up, as a float, or a :ref:(connect timeout,\nread timeout) <timeouts> tuple.
  • \n
  • allow_redirects: (optional) Set to True by default.
  • \n
  • proxies: (optional) Dictionary mapping protocol or protocol and\nhostname to the URL of the proxy.
  • \n
  • stream: (optional) whether to immediately download the response\ncontent. Defaults to False.
  • \n
  • verify: (optional) Either a boolean, in which case it controls whether we verify\nthe server's TLS certificate, or a string, in which case it must be a path\nto a CA bundle to use. Defaults to True. When set to\nFalse, requests will accept any TLS certificate presented by\nthe server, and will ignore hostname mismatches and/or expired\ncertificates, which will make your application vulnerable to\nman-in-the-middle (MitM) attacks. Setting verify to False\nmay be useful during local development or testing.
  • \n
  • cert: (optional) if String, path to ssl client cert file (.pem).\nIf Tuple, ('cert', 'key') pair.
  • \n
\n", "signature": "(self, *args, **kwargs):", "funcdef": "def"}, {"fullname": "dlt.sources.helpers.requests.typing", "modulename": "dlt.sources.helpers.requests.typing", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.requests.typing.TRequestTimeout", "modulename": "dlt.sources.helpers.requests.typing", "qualname": "TRequestTimeout", "kind": "variable", "doc": "

\n", "default_value": "typing.Union[int, float, datetime.timedelta, typing.Tuple[typing.Union[int, float, datetime.timedelta], typing.Union[int, float, datetime.timedelta]]]"}, {"fullname": "dlt.sources.helpers.transform", "modulename": "dlt.sources.helpers.transform", "kind": "module", "doc": "

\n"}, {"fullname": "dlt.sources.helpers.transform.take_first", "modulename": "dlt.sources.helpers.transform", "qualname": "take_first", "kind": "function", "doc": "

A filter that takes only first max_items from a resource

\n", "signature": "(max_items: int) -> Callable[[Any], bool]:", "funcdef": "def"}, {"fullname": "dlt.sources.helpers.transform.skip_first", "modulename": "dlt.sources.helpers.transform", "qualname": "skip_first", "kind": "function", "doc": "

A filter that skips first max_items from a resource

\n", "signature": "(max_items: int) -> Callable[[Any], bool]:", "funcdef": "def"}]; - - // mirrored in build-search-index.js (part 1) - // Also split on html tags. this is a cheap heuristic, but good enough. - elasticlunr.tokenizer.setSeperator(/[\s\-.;&_'"=,()]+|<[^>]*>/); - - let searchIndex; - if (docs._isPrebuiltIndex) { - console.info("using precompiled search index"); - searchIndex = elasticlunr.Index.load(docs); - } else { - console.time("building search index"); - // mirrored in build-search-index.js (part 2) - searchIndex = elasticlunr(function () { - this.pipeline.remove(elasticlunr.stemmer); - this.pipeline.remove(elasticlunr.stopWordFilter); - this.addField("qualname"); - this.addField("fullname"); - this.addField("annotation"); - this.addField("default_value"); - this.addField("signature"); - this.addField("bases"); - this.addField("doc"); - this.setRef("fullname"); - }); - for (let doc of docs) { - searchIndex.addDoc(doc); - } - console.timeEnd("building search index"); - } - - return (term) => searchIndex.search(term, { - fields: { - qualname: {boost: 4}, - fullname: {boost: 2}, - annotation: {boost: 2}, - default_value: {boost: 2}, - signature: {boost: 2}, - bases: {boost: 2}, - doc: {boost: 1}, - }, - expand: true - }); -})(); \ No newline at end of file From 9f5421c810427e3a01df6541885c1717e92e9bb2 Mon Sep 17 00:00:00 2001 From: AstrakhantsevaAA Date: Wed, 13 Sep 2023 15:51:06 +0200 Subject: [PATCH 06/10] added markdown references for sphinx --- .../docs/api_ref_sphinx/markdown/api.md | 96 +++++++++++++++++++ .../docs/api_ref_sphinx/markdown/index.md | 19 ++++ 2 files changed, 115 insertions(+) create mode 100644 docs/website/docs/api_ref_sphinx/markdown/api.md create mode 100644 docs/website/docs/api_ref_sphinx/markdown/index.md diff --git a/docs/website/docs/api_ref_sphinx/markdown/api.md b/docs/website/docs/api_ref_sphinx/markdown/api.md new file mode 100644 index 0000000000..10af1453b3 --- /dev/null +++ b/docs/website/docs/api_ref_sphinx/markdown/api.md @@ -0,0 +1,96 @@ +# API + +### dlt.pipeline.attach(pipeline_name: str = None, pipelines_dir: str = None, pipeline_salt: ~dlt.common.typing.TSecretValue = None, full_refresh: bool = False, credentials: ~typing.Any = None, progress: ~dlt.common.runtime.collector.Collector | ~typing.Literal['tqdm', 'enlighten', 'log', 'alive_progress'] = , \*\*kwargs: ~typing.Any) + +Attaches to the working folder of pipeline_name in pipelines_dir or in default directory. Requires that valid pipeline state exists in working folder. + +### dlt.pipeline.pipeline(pipeline_name: str = None, pipelines_dir: str = None, pipeline_salt: TSecretValue = None, destination: TDestinationReferenceArg = None, staging: TDestinationReferenceArg = None, dataset_name: str = None, import_schema_path: str = None, export_schema_path: str = None, full_refresh: bool = False, credentials: Any = None, progress: TCollectorArg = \_NULL_COLLECTOR) + +### dlt.pipeline.pipeline() + +Creates a new instance of dlt pipeline, which moves the data from the source ie. a REST API to a destination ie. database or a data lake. + +Summary +: The pipeline functions allows you to pass the destination name to which the data should be loaded, the name of the dataset and several other options that govern loading of the data. + The created Pipeline object lets you load the data from any source with run method or to have more granular control over the loading process with extract, normalize and load methods. 
+ +Please refer to the following doc pages +: - Write your first pipeline walkthrough: [https://dlthub.com/docs/walkthroughs/create-a-pipeline](https://dlthub.com/docs/walkthroughs/create-a-pipeline) + - Pipeline architecture and data loading steps: [https://dlthub.com/docs/reference](https://dlthub.com/docs/reference) + - List of supported destinations: [https://dlthub.com/docs/dlt-ecosystem/destinations](https://dlthub.com/docs/dlt-ecosystem/destinations) + +* **Parameters:** + * **pipeline_name** (*May also be provided later to the run* *or* *load methods* *of* *the Pipeline. If not provided at all then defaults to the*) – A name of the pipeline that will be used to identify it in monitoring events and to restore its state and data schemas on subsequent runs. + * **added.** (*Defaults to the file name* *of* *pipeline script with dlt_ prefix*) – + * **pipelines_dir** (*str**,* *optional*) – A working directory in which pipeline state and temporary files will be stored. Defaults to user home directory: ~/dlt/pipelines/. + * **pipeline_salt** (*TSecretValue**,* *optional*) – A random value used for deterministic hashing during data anonymization. Defaults to a value derived from the pipeline name. + * **purposes.** (*Default value should not be used for any cryptographic*) – + * **destination** (*str* *|* *DestinationReference**,* *optional*) – A name of the destination to which dlt will load the data, or a destination module imported from dlt.destination. + * **pipeline.** (*May also be provided to run method* *of* *the*) – + * **staging** (*str* *|* *DestinationReference**,* *optional*) – A name of the destination where dlt will stage the data before final loading, or a destination module imported from dlt.destination. + * **pipeline.** – + * **dataset_name** (*str**,* *optional*) – A name of the dataset to which the data will be loaded. A dataset is a logical group of tables ie. schema in relational databases or folder grouping many files. + * **pipeline_name** – + * **import_schema_path** (*str**,* *optional*) – A path from which the schema yaml file will be imported on each pipeline run. Defaults to None which disables importing. + * **export_schema_path** (*str**,* *optional*) – A path where the schema yaml file will be exported after every schema change. Defaults to None which disables exporting. + * **full_refresh** (*bool**,* *optional*) – When set to True, each instance of the pipeline with the pipeline_name starts from scratch when run and loads the data to a separate dataset. + * **False.** (*The datasets are identified by dataset_name_ + datetime suffix. Use this setting whenever you experiment with your data to be sure you start fresh on each run. Defaults to*) – + * **credentials** (*Any**,* *optional*) – Credentials for the destination ie. database connection string or a dictionary with google cloud credentials. + * **None** (*In most cases should be set to*) – + * **values.** (*which lets dlt to use secrets.toml* *or* *environment variables to infer right credentials*) – + * **progress** (*str**,* *Collector*) – A progress monitor that shows progress bars, console or log messages with current information on sources, resources, data items etc. processed in + * **extract** – + * **module.** (*normalize and load stage. Pass a string with a collector name* *or* *configure your own by choosing from dlt.progress*) – + * **libraries** (*We support most* *of* *the progress*) – try passing tqdm, enlighten or alive_progress or log to write to console/log. 
+* **Returns:** + An instance of Pipeline class with. Please check the documentation of run method for information on what to do with it. +* **Return type:** + Pipeline + +### dlt.pipeline.run(data: Any, \*, destination: DestinationReference | module | None | str = None, staging: DestinationReference | module | None | str = None, dataset_name: str | None = None, credentials: Any | None = None, table_name: str | None = None, write_disposition: Literal['skip', 'append', 'replace', 'merge'] | None = None, columns: Sequence[TColumnSchema] | None = None, schema: Schema | None = None) + +Loads the data in data argument into the destination specified in destination and dataset specified in dataset_name. + +Summary +: This method will extract the data from the data argument, infer the schema, normalize the data into a load package (ie. jsonl or PARQUET files representing tables) and then load such packages into the destination. + +The data may be supplied in several forms: +: * a list or Iterable of any JSON-serializable objects ie. dlt.run([1, 2, 3], table_name=”numbers”) + * any Iterator or a function that yield (Generator) ie. dlt.run(range(1, 10), table_name=”range”) + * a function or a list of functions decorated with @dlt.resource ie. dlt.run([chess_players(title=”GM”), chess_games()]) + * a function or a list of functions decorated with @dlt.source. + +Please note that dlt deals with bytes, datetime, decimal and uuid objects so you are free to load binary data or documents containing dates. + +Execution +: The run method will first use sync_destination method to synchronize pipeline state and schemas with the destination. You can disable this behavior with restore_from_destination configuration option. + Next it will make sure that data from the previous is fully processed. If not, run method normalizes and loads pending data items. + Only then the new data from data argument is extracted, normalized and loaded. + +* **Parameters:** + * **data** (*The behavior* *of* *this argument depends on the type* *of* *the*) – Data to be loaded to destination + * **destination** (*str* *|* *DestinationReference**,* *optional*) – A name of the destination to which dlt will load the data, or a destination module imported from dlt.destination. + * **provided** (*If not*) – + * **used.** (*the value passed to dlt.pipeline will be*) – + * **dataset_name** (*str**,* *optional*) – A name of the dataset to which the data will be loaded. A dataset is a logical group of tables ie. schema in relational databases or folder grouping many files. + * **provided** – + * **pipeline_name** (*the value passed to dlt.pipeline will be used. If not provided at all then defaults to the*) – + * **credentials** (*Any**,* *optional*) – Credentials for the destination ie. database connection string or a dictionary with google cloud credentials. + * **None** (*In most cases should be set to*) – + * **values.** (*which lets dlt to use secrets.toml* *or* *environment variables to infer right credentials*) – + * **table_name** (*str**,* *optional*) – The name of the table to which the data should be loaded within the dataset. This argument is required for a data that is a list/Iterable or Iterator without \_\_name_\_ attribute. + * **data** – + * **functions** (*\* generator*) – the function name is used as table name, table_name overrides this default + * **@dlt.resource** (*\**) – resource contains the full table schema and that includes the table name. table_name will override this property. Use with care! 
+ * **@dlt.source** (*\**) – source contains several resources each with a table schema. table_name will override all table names within the source and load the data into single table. + * **write_disposition** (*Literal**[**"skip"**,* *"append"**,* *"replace"**,* *"merge"**]**,* *optional*) – Controls how to write data to a table. append will always add new data at the end of the table. replace will replace existing data with new data. skip will prevent data from loading. “merge” will deduplicate and merge data based on “primary_key” and “merge_key” hints. Defaults to “append”. + * **dlt.source** (*Please note that in case* *of* *dlt.resource the table schema value will be overwritten and in case of*) – + * **overwritten.** (*the values in all resources will be*) – + * **columns** (*Sequence**[**TColumnSchema**]**,* *optional*) – A list of column schemas. Typed dictionary describing column names, data types, write disposition and performance hints that gives you full control over the created table schema. + * **schema** (*Schema**,* *optional*) – An explicit Schema object in which all table schemas will be grouped. By default dlt takes the schema from the source (if passed in data argument) or creates a default one itself. +* **Raises:** + **PipelineStepFailed when a problem happened during extract****,** **normalize** **or** **load steps.** – +* **Returns:** + Information on loaded data including the list of package ids and failed job statuses. Please not that dlt will not raise if a single job terminally fails. Such information is provided via LoadInfo. +* **Return type:** + LoadInfo diff --git a/docs/website/docs/api_ref_sphinx/markdown/index.md b/docs/website/docs/api_ref_sphinx/markdown/index.md new file mode 100644 index 0000000000..c3b15484b4 --- /dev/null +++ b/docs/website/docs/api_ref_sphinx/markdown/index.md @@ -0,0 +1,19 @@ + + +# Welcome to api_reference’s documentation! 
+ +# Contents: + +* [API](api.md) + * [`attach()`](api.md#dlt.pipeline.attach) + * [`pipeline()`](api.md#dlt.pipeline.pipeline) + * [`run()`](api.md#dlt.pipeline.run) + +# Indices and tables + +* [Index](genindex.md) +* [Module Index](py-modindex.md) +* [Search Page](search.md) From 4ec26f1f12ff16747327652aedba16657e605172 Mon Sep 17 00:00:00 2001 From: AstrakhantsevaAA Date: Wed, 13 Sep 2023 17:28:36 +0200 Subject: [PATCH 07/10] added markdown references for pydoc --- docs/website/docs/api_ref_pydoc/index.md | 249 ++++++++++++++++++ .../docs/api_ref_sphinx/markdown/api.md | 96 ------- .../docs/api_ref_sphinx/markdown/index.md | 19 -- docs/website/sidebars.js | 24 +- 4 files changed, 254 insertions(+), 134 deletions(-) create mode 100644 docs/website/docs/api_ref_pydoc/index.md delete mode 100644 docs/website/docs/api_ref_sphinx/markdown/api.md delete mode 100644 docs/website/docs/api_ref_sphinx/markdown/index.md diff --git a/docs/website/docs/api_ref_pydoc/index.md b/docs/website/docs/api_ref_pydoc/index.md new file mode 100644 index 0000000000..9bb885999f --- /dev/null +++ b/docs/website/docs/api_ref_pydoc/index.md @@ -0,0 +1,249 @@ +# Table of Contents + +* [dlt/pipeline](#dlt/pipeline) + * [pipeline](#dlt/pipeline.pipeline) + * [pipeline](#dlt/pipeline.pipeline) + * [pipeline](#dlt/pipeline.pipeline) + * [attach](#dlt/pipeline.attach) + * [run](#dlt/pipeline.run) + + + +# dlt/pipeline + + + +#### pipeline + +```python +@overload +def pipeline(pipeline_name: str = None, + pipelines_dir: str = None, + pipeline_salt: TSecretValue = None, + destination: TDestinationReferenceArg = None, + staging: TDestinationReferenceArg = None, + dataset_name: str = None, + import_schema_path: str = None, + export_schema_path: str = None, + full_refresh: bool = False, + credentials: Any = None, + progress: TCollectorArg = _NULL_COLLECTOR) -> Pipeline +``` + +Creates a new instance of `dlt` pipeline, which moves the data from the source ie. a REST API to a destination ie. database or a data lake. + +Summary +The `pipeline` functions allows you to pass the destination name to which the data should be loaded, the name of the dataset and several other options that govern loading of the data. +The created `Pipeline` object lets you load the data from any source with `run` method or to have more granular control over the loading process with `extract`, `normalize` and `load` methods. + +Please refer to the following doc pages +- Write your first pipeline walkthrough: https://dlthub.com/docs/walkthroughs/create-a-pipeline +- Pipeline architecture and data loading steps: https://dlthub.com/docs/reference +- List of supported destinations: https://dlthub.com/docs/dlt-ecosystem/destinations + +**Arguments**: + +- `pipeline_name` _str, optional_ - A name of the pipeline that will be used to identify it in monitoring events and to restore its state and data schemas on subsequent runs. + Defaults to the file name of pipeline script with `dlt_` prefix added. + +- `pipelines_dir` _str, optional_ - A working directory in which pipeline state and temporary files will be stored. Defaults to user home directory: `~/dlt/pipelines/`. + +- `pipeline_salt` _TSecretValue, optional_ - A random value used for deterministic hashing during data anonymization. Defaults to a value derived from the pipeline name. + Default value should not be used for any cryptographic purposes. 
+ +- `destination` _str | DestinationReference, optional_ - A name of the destination to which dlt will load the data, or a destination module imported from `dlt.destination`. + May also be provided to `run` method of the `pipeline`. + +- `staging` _str | DestinationReference, optional_ - A name of the destination where dlt will stage the data before final loading, or a destination module imported from `dlt.destination`. + May also be provided to `run` method of the `pipeline`. + +- `dataset_name` _str, optional_ - A name of the dataset to which the data will be loaded. A dataset is a logical group of tables ie. `schema` in relational databases or folder grouping many files. + May also be provided later to the `run` or `load` methods of the `Pipeline`. If not provided at all then defaults to the `pipeline_name` + +- `import_schema_path` _str, optional_ - A path from which the schema `yaml` file will be imported on each pipeline run. Defaults to None which disables importing. + +- `export_schema_path` _str, optional_ - A path where the schema `yaml` file will be exported after every schema change. Defaults to None which disables exporting. + +- `full_refresh` _bool, optional_ - When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and loads the data to a separate dataset. + The datasets are identified by `dataset_name_` + datetime suffix. Use this setting whenever you experiment with your data to be sure you start fresh on each run. Defaults to False. + +- `credentials` _Any, optional_ - Credentials for the `destination` ie. database connection string or a dictionary with google cloud credentials. + In most cases should be set to None, which lets `dlt` to use `secrets.toml` or environment variables to infer right credentials values. + + progress(str, Collector): A progress monitor that shows progress bars, console or log messages with current information on sources, resources, data items etc. processed in + `extract`, `normalize` and `load` stage. Pass a string with a collector name or configure your own by choosing from `dlt.progress` module. + We support most of the progress libraries: try passing `tqdm`, `enlighten` or `alive_progress` or `log` to write to console/log. + + +**Returns**: + +- `Pipeline` - An instance of `Pipeline` class with. Please check the documentation of `run` method for information on what to do with it. + + + +#### pipeline + +```python +@overload +def pipeline() -> Pipeline +``` + +When called without any arguments, returns the recently created `Pipeline` instance. +If not found, it creates a new instance with all the pipeline options set to defaults. + + + +#### pipeline + +```python +@with_config(spec=PipelineConfiguration, auto_pipeline_section=True) +def pipeline(pipeline_name: str = None, + pipelines_dir: str = None, + pipeline_salt: TSecretValue = None, + destination: TDestinationReferenceArg = None, + staging: TDestinationReferenceArg = None, + dataset_name: str = None, + import_schema_path: str = None, + export_schema_path: str = None, + full_refresh: bool = False, + credentials: Any = None, + progress: TCollectorArg = _NULL_COLLECTOR, + **kwargs: Any) -> Pipeline +``` + +Creates a new instance of `dlt` pipeline, which moves the data from the source ie. a REST API to a destination ie. database or a data lake. + +Summary +The `pipeline` functions allows you to pass the destination name to which the data should be loaded, the name of the dataset and several other options that govern loading of the data. 
+The created `Pipeline` object lets you load the data from any source with `run` method or to have more granular control over the loading process with `extract`, `normalize` and `load` methods. + +Please refer to the following doc pages +- Write your first pipeline walkthrough: https://dlthub.com/docs/walkthroughs/create-a-pipeline +- Pipeline architecture and data loading steps: https://dlthub.com/docs/reference +- List of supported destinations: https://dlthub.com/docs/dlt-ecosystem/destinations + +**Arguments**: + +- `pipeline_name` _str, optional_ - A name of the pipeline that will be used to identify it in monitoring events and to restore its state and data schemas on subsequent runs. + Defaults to the file name of pipeline script with `dlt_` prefix added. + +- `pipelines_dir` _str, optional_ - A working directory in which pipeline state and temporary files will be stored. Defaults to user home directory: `~/dlt/pipelines/`. + +- `pipeline_salt` _TSecretValue, optional_ - A random value used for deterministic hashing during data anonymization. Defaults to a value derived from the pipeline name. + Default value should not be used for any cryptographic purposes. + +- `destination` _str | DestinationReference, optional_ - A name of the destination to which dlt will load the data, or a destination module imported from `dlt.destination`. + May also be provided to `run` method of the `pipeline`. + +- `staging` _str | DestinationReference, optional_ - A name of the destination where dlt will stage the data before final loading, or a destination module imported from `dlt.destination`. + May also be provided to `run` method of the `pipeline`. + +- `dataset_name` _str, optional_ - A name of the dataset to which the data will be loaded. A dataset is a logical group of tables ie. `schema` in relational databases or folder grouping many files. + May also be provided later to the `run` or `load` methods of the `Pipeline`. If not provided at all then defaults to the `pipeline_name` + +- `import_schema_path` _str, optional_ - A path from which the schema `yaml` file will be imported on each pipeline run. Defaults to None which disables importing. + +- `export_schema_path` _str, optional_ - A path where the schema `yaml` file will be exported after every schema change. Defaults to None which disables exporting. + +- `full_refresh` _bool, optional_ - When set to True, each instance of the pipeline with the `pipeline_name` starts from scratch when run and loads the data to a separate dataset. + The datasets are identified by `dataset_name_` + datetime suffix. Use this setting whenever you experiment with your data to be sure you start fresh on each run. Defaults to False. + +- `credentials` _Any, optional_ - Credentials for the `destination` ie. database connection string or a dictionary with google cloud credentials. + In most cases should be set to None, which lets `dlt` to use `secrets.toml` or environment variables to infer right credentials values. + + progress(str, Collector): A progress monitor that shows progress bars, console or log messages with current information on sources, resources, data items etc. processed in + `extract`, `normalize` and `load` stage. Pass a string with a collector name or configure your own by choosing from `dlt.progress` module. + We support most of the progress libraries: try passing `tqdm`, `enlighten` or `alive_progress` or `log` to write to console/log. + + +**Returns**: + +- `Pipeline` - An instance of `Pipeline` class with. 


#### attach

```python
@with_config(spec=PipelineConfiguration, auto_pipeline_section=True)
def attach(pipeline_name: str = None,
           pipelines_dir: str = None,
           pipeline_salt: TSecretValue = None,
           full_refresh: bool = False,
           credentials: Any = None,
           progress: TCollectorArg = _NULL_COLLECTOR,
           **kwargs: Any) -> Pipeline
```

Attaches to the working folder of `pipeline_name` in `pipelines_dir` or in the default directory. Requires that a valid pipeline state exists in the working folder.
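**Example** (an illustrative sketch: `chess_pipeline` is a hypothetical name and must match a pipeline that was previously run on this machine):

```python
import dlt

# Re-open the working folder of an existing pipeline to inspect or resume it.
pipeline = dlt.attach(pipeline_name="chess_pipeline")
print(pipeline.dataset_name)
```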


#### run

```python
def run(data: Any,
        *,
        destination: TDestinationReferenceArg = None,
        staging: TDestinationReferenceArg = None,
        dataset_name: str = None,
        credentials: Any = None,
        table_name: str = None,
        write_disposition: TWriteDisposition = None,
        columns: Sequence[TColumnSchema] = None,
        schema: Schema = None) -> LoadInfo
```

Loads the data in the `data` argument into the destination specified in `destination` and the dataset specified in `dataset_name`.

Summary
This method will `extract` the data from the `data` argument, infer the schema, `normalize` the data into a load package (i.e. jsonl or PARQUET files representing tables) and then `load` such packages into the `destination`.

The data may be supplied in several forms:
* a `list` or `Iterable` of any JSON-serializable objects, i.e. `dlt.run([1, 2, 3], table_name="numbers")`
* any `Iterator` or a function that yields (a `Generator`), i.e. `dlt.run(range(1, 10), table_name="range")`
* a function or a list of functions decorated with @dlt.resource, i.e. `dlt.run([chess_players(title="GM"), chess_games()])`
* a function or a list of functions decorated with @dlt.source.

Please note that `dlt` deals with `bytes`, `datetime`, `decimal` and `uuid` objects, so you are free to load binary data or documents containing dates.

Execution
The `run` method will first use the `sync_destination` method to synchronize pipeline state and schemas with the destination. You can disable this behavior with the `restore_from_destination` configuration option.
Next it will make sure that data from the previous run is fully processed. If not, the `run` method normalizes and loads the pending data items.
Only then is the new data from the `data` argument extracted, normalized and loaded.

**Arguments**:

- `data` _Any_ - Data to be loaded to the destination.

- `destination` _str | DestinationReference, optional_ - A name of the destination to which dlt will load the data, or a destination module imported from `dlt.destination`.
  If not provided, the value passed to `dlt.pipeline` will be used.

- `dataset_name` _str, optional_ - A name of the dataset to which the data will be loaded. A dataset is a logical group of tables, i.e. a `schema` in relational databases or a folder grouping many files.
  If not provided, the value passed to `dlt.pipeline` will be used. If not provided at all, it defaults to the `pipeline_name`.

- `credentials` _Any, optional_ - Credentials for the `destination`, i.e. a database connection string or a dictionary with Google Cloud credentials.
  In most cases this should be set to None, which lets `dlt` use `secrets.toml` or environment variables to infer the right credentials values.

- `table_name` _str, optional_ - The name of the table to which the data should be loaded within the `dataset`. This argument is required for `data` that is a list/`Iterable` or an `Iterator` without a `__name__` attribute.
  The behavior of this argument depends on the type of the `data`:
  * generator functions: the function name is used as the table name; `table_name` overrides this default
  * `@dlt.resource`: the resource contains the full table schema, and that includes the table name. `table_name` will override this property. Use with care!
  * `@dlt.source`: the source contains several resources, each with a table schema. `table_name` will override all table names within the source and load the data into a single table.

- `write_disposition` _Literal["skip", "append", "replace", "merge"], optional_ - Controls how to write data to a table. `append` will always add new data at the end of the table. `replace` will replace existing data with new data. `skip` will prevent data from loading. `merge` will deduplicate and merge data based on `primary_key` and `merge_key` hints. Defaults to `append`.
  Please note that in the case of `dlt.resource` the table schema value will be overwritten, and in the case of `dlt.source` the values in all resources will be overwritten.

- `columns` _Sequence[TColumnSchema], optional_ - A list of column schemas. A typed dictionary describing column names, data types, write disposition and performance hints that gives you full control over the created table schema.

- `schema` _Schema, optional_ - An explicit `Schema` object in which all table schemas will be grouped. By default `dlt` takes the schema from the source (if passed in the `data` argument) or creates a default one itself.


**Raises**:

  PipelineStepFailed when a problem happens during the `extract`, `normalize` or `load` steps.

**Returns**:

- `LoadInfo` - Information on the loaded data including the list of package ids and failed job statuses. Please note that `dlt` will not raise if a single job terminally fails. Such information is provided via LoadInfo.
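**Example** (an illustrative sketch: the names and the `duckdb` destination are placeholder assumptions, not required values):

```python
import dlt

# Hypothetical pipeline; any configured destination works the same way.
pipeline = dlt.pipeline(
    pipeline_name="players_pipeline",
    destination="duckdb",
    dataset_name="players_data",
)

# A plain list has no __name__ attribute, so table_name is required;
# write_disposition="replace" overwrites data loaded by previous runs.
load_info = pipeline.run(
    [{"name": "magnus", "rating": 2850}, {"name": "hikaru", "rating": 2780}],
    table_name="players",
    write_disposition="replace",
)
print(load_info)
```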
diff --git a/docs/website/docs/api_ref_sphinx/markdown/api.md b/docs/website/docs/api_ref_sphinx/markdown/api.md
deleted file mode 100644
index 10af1453b3..0000000000
--- a/docs/website/docs/api_ref_sphinx/markdown/api.md
+++ /dev/null
@@ -1,96 +0,0 @@
-# API
-
-### dlt.pipeline.attach(pipeline_name: str = None, pipelines_dir: str = None, pipeline_salt: ~dlt.common.typing.TSecretValue = None, full_refresh: bool = False, credentials: ~typing.Any = None, progress: ~dlt.common.runtime.collector.Collector | ~typing.Literal['tqdm', 'enlighten', 'log', 'alive_progress'] = , \*\*kwargs: ~typing.Any)
-
-Attaches to the working folder of pipeline_name in pipelines_dir or in default directory. Requires that valid pipeline state exists in working folder.
-
-### dlt.pipeline.pipeline(pipeline_name: str = None, pipelines_dir: str = None, pipeline_salt: TSecretValue = None, destination: TDestinationReferenceArg = None, staging: TDestinationReferenceArg = None, dataset_name: str = None, import_schema_path: str = None, export_schema_path: str = None, full_refresh: bool = False, credentials: Any = None, progress: TCollectorArg = \_NULL_COLLECTOR)
-
-### dlt.pipeline.pipeline()
-
-Creates a new instance of dlt pipeline, which moves the data from the source ie. a REST API to a destination ie. database or a data lake.
-
-Summary
-: The pipeline functions allows you to pass the destination name to which the data should be loaded, the name of the dataset and several other options that govern loading of the data.
-  The created Pipeline object lets you load the data from any source with run method or to have more granular control over the loading process with extract, normalize and load methods.
-
-Please refer to the following doc pages
-: - Write your first pipeline walkthrough: [https://dlthub.com/docs/walkthroughs/create-a-pipeline](https://dlthub.com/docs/walkthroughs/create-a-pipeline)
-  - Pipeline architecture and data loading steps: [https://dlthub.com/docs/reference](https://dlthub.com/docs/reference)
-  - List of supported destinations: [https://dlthub.com/docs/dlt-ecosystem/destinations](https://dlthub.com/docs/dlt-ecosystem/destinations)
-
-* **Parameters:**
-  * **pipeline_name** (*May also be provided later to the run* *or* *load methods* *of* *the Pipeline. If not provided at all then defaults to the*) – A name of the pipeline that will be used to identify it in monitoring events and to restore its state and data schemas on subsequent runs.
-  * **added.** (*Defaults to the file name* *of* *pipeline script with dlt_ prefix*) –
-  * **pipelines_dir** (*str**,* *optional*) – A working directory in which pipeline state and temporary files will be stored. Defaults to user home directory: ~/dlt/pipelines/.
-  * **pipeline_salt** (*TSecretValue**,* *optional*) – A random value used for deterministic hashing during data anonymization. Defaults to a value derived from the pipeline name.
-  * **purposes.** (*Default value should not be used for any cryptographic*) –
-  * **destination** (*str* *|* *DestinationReference**,* *optional*) – A name of the destination to which dlt will load the data, or a destination module imported from dlt.destination.
-  * **pipeline.** (*May also be provided to run method* *of* *the*) –
-  * **staging** (*str* *|* *DestinationReference**,* *optional*) – A name of the destination where dlt will stage the data before final loading, or a destination module imported from dlt.destination.
-  * **pipeline.** –
-  * **dataset_name** (*str**,* *optional*) – A name of the dataset to which the data will be loaded. A dataset is a logical group of tables ie. schema in relational databases or folder grouping many files.
-  * **pipeline_name** –
-  * **import_schema_path** (*str**,* *optional*) – A path from which the schema yaml file will be imported on each pipeline run. Defaults to None which disables importing.
-  * **export_schema_path** (*str**,* *optional*) – A path where the schema yaml file will be exported after every schema change. Defaults to None which disables exporting.
-  * **full_refresh** (*bool**,* *optional*) – When set to True, each instance of the pipeline with the pipeline_name starts from scratch when run and loads the data to a separate dataset.
-  * **False.** (*The datasets are identified by dataset_name_ + datetime suffix. Use this setting whenever you experiment with your data to be sure you start fresh on each run. Defaults to*) –
-  * **credentials** (*Any**,* *optional*) – Credentials for the destination ie. database connection string or a dictionary with google cloud credentials.
-  * **None** (*In most cases should be set to*) –
-  * **values.** (*which lets dlt to use secrets.toml* *or* *environment variables to infer right credentials*) –
-  * **progress** (*str**,* *Collector*) – A progress monitor that shows progress bars, console or log messages with current information on sources, resources, data items etc. processed in
-  * **extract** –
-  * **module.** (*normalize and load stage. Pass a string with a collector name* *or* *configure your own by choosing from dlt.progress*) –
-  * **libraries** (*We support most* *of* *the progress*) – try passing tqdm, enlighten or alive_progress or log to write to console/log.
-* **Returns:** - An instance of Pipeline class with. Please check the documentation of run method for information on what to do with it. -* **Return type:** - Pipeline - -### dlt.pipeline.run(data: Any, \*, destination: DestinationReference | module | None | str = None, staging: DestinationReference | module | None | str = None, dataset_name: str | None = None, credentials: Any | None = None, table_name: str | None = None, write_disposition: Literal['skip', 'append', 'replace', 'merge'] | None = None, columns: Sequence[TColumnSchema] | None = None, schema: Schema | None = None) - -Loads the data in data argument into the destination specified in destination and dataset specified in dataset_name. - -Summary -: This method will extract the data from the data argument, infer the schema, normalize the data into a load package (ie. jsonl or PARQUET files representing tables) and then load such packages into the destination. - -The data may be supplied in several forms: -: * a list or Iterable of any JSON-serializable objects ie. dlt.run([1, 2, 3], table_name=”numbers”) - * any Iterator or a function that yield (Generator) ie. dlt.run(range(1, 10), table_name=”range”) - * a function or a list of functions decorated with @dlt.resource ie. dlt.run([chess_players(title=”GM”), chess_games()]) - * a function or a list of functions decorated with @dlt.source. - -Please note that dlt deals with bytes, datetime, decimal and uuid objects so you are free to load binary data or documents containing dates. - -Execution -: The run method will first use sync_destination method to synchronize pipeline state and schemas with the destination. You can disable this behavior with restore_from_destination configuration option. - Next it will make sure that data from the previous is fully processed. If not, run method normalizes and loads pending data items. - Only then the new data from data argument is extracted, normalized and loaded. - -* **Parameters:** - * **data** (*The behavior* *of* *this argument depends on the type* *of* *the*) – Data to be loaded to destination - * **destination** (*str* *|* *DestinationReference**,* *optional*) – A name of the destination to which dlt will load the data, or a destination module imported from dlt.destination. - * **provided** (*If not*) – - * **used.** (*the value passed to dlt.pipeline will be*) – - * **dataset_name** (*str**,* *optional*) – A name of the dataset to which the data will be loaded. A dataset is a logical group of tables ie. schema in relational databases or folder grouping many files. - * **provided** – - * **pipeline_name** (*the value passed to dlt.pipeline will be used. If not provided at all then defaults to the*) – - * **credentials** (*Any**,* *optional*) – Credentials for the destination ie. database connection string or a dictionary with google cloud credentials. - * **None** (*In most cases should be set to*) – - * **values.** (*which lets dlt to use secrets.toml* *or* *environment variables to infer right credentials*) – - * **table_name** (*str**,* *optional*) – The name of the table to which the data should be loaded within the dataset. This argument is required for a data that is a list/Iterable or Iterator without \_\_name_\_ attribute. - * **data** – - * **functions** (*\* generator*) – the function name is used as table name, table_name overrides this default - * **@dlt.resource** (*\**) – resource contains the full table schema and that includes the table name. table_name will override this property. Use with care! 
- * **@dlt.source** (*\**) – source contains several resources each with a table schema. table_name will override all table names within the source and load the data into single table. - * **write_disposition** (*Literal**[**"skip"**,* *"append"**,* *"replace"**,* *"merge"**]**,* *optional*) – Controls how to write data to a table. append will always add new data at the end of the table. replace will replace existing data with new data. skip will prevent data from loading. “merge” will deduplicate and merge data based on “primary_key” and “merge_key” hints. Defaults to “append”. - * **dlt.source** (*Please note that in case* *of* *dlt.resource the table schema value will be overwritten and in case of*) – - * **overwritten.** (*the values in all resources will be*) – - * **columns** (*Sequence**[**TColumnSchema**]**,* *optional*) – A list of column schemas. Typed dictionary describing column names, data types, write disposition and performance hints that gives you full control over the created table schema. - * **schema** (*Schema**,* *optional*) – An explicit Schema object in which all table schemas will be grouped. By default dlt takes the schema from the source (if passed in data argument) or creates a default one itself. -* **Raises:** - **PipelineStepFailed when a problem happened during extract****,** **normalize** **or** **load steps.** – -* **Returns:** - Information on loaded data including the list of package ids and failed job statuses. Please not that dlt will not raise if a single job terminally fails. Such information is provided via LoadInfo. -* **Return type:** - LoadInfo diff --git a/docs/website/docs/api_ref_sphinx/markdown/index.md b/docs/website/docs/api_ref_sphinx/markdown/index.md deleted file mode 100644 index c3b15484b4..0000000000 --- a/docs/website/docs/api_ref_sphinx/markdown/index.md +++ /dev/null @@ -1,19 +0,0 @@ - - -# Welcome to api_reference’s documentation! 
- -# Contents: - -* [API](api.md) - * [`attach()`](api.md#dlt.pipeline.attach) - * [`pipeline()`](api.md#dlt.pipeline.pipeline) - * [`run()`](api.md#dlt.pipeline.run) - -# Indices and tables - -* [Index](genindex.md) -* [Module Index](py-modindex.md) -* [Search Page](search.md) diff --git a/docs/website/sidebars.js b/docs/website/sidebars.js index 84af1cbdd7..3a11642e70 100644 --- a/docs/website/sidebars.js +++ b/docs/website/sidebars.js @@ -277,28 +277,14 @@ const sidebars = { }, items: [ { - type: 'category', + type: 'doc', label: 'pipeline_pdoc', - link: { - type: 'doc', - id: 'api_reference_pdoc/pipeline/index', - }, - items: [ - 'api_reference_pdoc/pipeline/dbt', - 'api_reference_pdoc/pipeline/exceptions', - ], + id: 'api_reference_pdoc/pipeline/index', }, { - type: 'category', - label: 'pipeline_sphinx', - link: { - type: 'doc', - id: 'api_ref_sphinx/markdown/index', - }, - items: [ - 'api_reference_pdoc/pipeline/dbt', - 'api_reference_pdoc/pipeline/exceptions', - ], + type: 'doc', + label: 'pipeline_pydoc', + id: 'api_ref_pydoc/index', }, ], }, From 66caf6c13dd06366914369c29eeece5b74e60a3f Mon Sep 17 00:00:00 2001 From: AstrakhantsevaAA Date: Wed, 13 Sep 2023 17:57:28 +0200 Subject: [PATCH 08/10] bring markdown titles back --- dlt/pipeline/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index bf4f57275f..7bdac3edee 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -222,7 +222,7 @@ def run( ) -> LoadInfo: """Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. - Summary + ### Summary This method will `extract` the data from the `data` argument, infer the schema, `normalize` the data into a load package (ie. jsonl or PARQUET files representing tables) and then `load` such packages into the `destination`. The data may be supplied in several forms: From 1048144d3bc74031c8bb249a384a67a3ad9b8b3e Mon Sep 17 00:00:00 2001 From: AstrakhantsevaAA Date: Wed, 13 Sep 2023 18:02:18 +0200 Subject: [PATCH 09/10] bring markdown titles back --- docs/website/docs/api_ref_pydoc/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/website/docs/api_ref_pydoc/index.md b/docs/website/docs/api_ref_pydoc/index.md index 9bb885999f..8fa082f14c 100644 --- a/docs/website/docs/api_ref_pydoc/index.md +++ b/docs/website/docs/api_ref_pydoc/index.md @@ -196,7 +196,7 @@ def run(data: Any, Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. -Summary +### Summary This method will `extract` the data from the `data` argument, infer the schema, `normalize` the data into a load package (ie. jsonl or PARQUET files representing tables) and then `load` such packages into the `destination`. The data may be supplied in several forms: From 0efa65247ad023193ea0f8c508567209310de490 Mon Sep 17 00:00:00 2001 From: AstrakhantsevaAA Date: Thu, 14 Sep 2023 13:56:26 +0200 Subject: [PATCH 10/10] bring markdown titles back --- dlt/pipeline/__init__.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/dlt/pipeline/__init__.py b/dlt/pipeline/__init__.py index 7bdac3edee..d65d9896e2 100644 --- a/dlt/pipeline/__init__.py +++ b/dlt/pipeline/__init__.py @@ -29,7 +29,7 @@ def pipeline( ) -> Pipeline: """Creates a new instance of `dlt` pipeline, which moves the data from the source ie. a REST API to a destination ie. database or a data lake. 
- Summary + #### Summary The `pipeline` functions allows you to pass the destination name to which the data should be loaded, the name of the dataset and several other options that govern loading of the data. The created `Pipeline` object lets you load the data from any source with `run` method or to have more granular control over the loading process with `extract`, `normalize` and `load` methods. @@ -38,7 +38,7 @@ def pipeline( - Pipeline architecture and data loading steps: https://dlthub.com/docs/reference - List of supported destinations: https://dlthub.com/docs/dlt-ecosystem/destinations - Args: + #### Args: pipeline_name (str, optional): A name of the pipeline that will be used to identify it in monitoring events and to restore its state and data schemas on subsequent runs. Defaults to the file name of pipeline script with `dlt_` prefix added. @@ -222,7 +222,7 @@ def run( ) -> LoadInfo: """Loads the data in `data` argument into the destination specified in `destination` and dataset specified in `dataset_name`. - ### Summary + #### Summary This method will `extract` the data from the `data` argument, infer the schema, `normalize` the data into a load package (ie. jsonl or PARQUET files representing tables) and then `load` such packages into the `destination`. The data may be supplied in several forms: @@ -233,12 +233,12 @@ def run( Please note that `dlt` deals with `bytes`, `datetime`, `decimal` and `uuid` objects so you are free to load binary data or documents containing dates. - Execution + #### Execution The `run` method will first use `sync_destination` method to synchronize pipeline state and schemas with the destination. You can disable this behavior with `restore_from_destination` configuration option. Next it will make sure that data from the previous is fully processed. If not, `run` method normalizes and loads pending data items. Only then the new data from `data` argument is extracted, normalized and loaded. - Args: + #### Args: data (Any): Data to be loaded to destination destination (str | DestinationReference, optional): A name of the destination to which dlt will load the data, or a destination module imported from `dlt.destination`.