diff --git a/dlt/sources/helpers/rest_client/paginators.py b/dlt/sources/helpers/rest_client/paginators.py
index c098ea667f..48dfdf6e4f 100644
--- a/dlt/sources/helpers/rest_client/paginators.py
+++ b/dlt/sources/helpers/rest_client/paginators.py
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
 from typing import Optional
+from urllib.parse import urlparse, urljoin
 
 from dlt.sources.helpers.requests import Response, Request
 from dlt.common import jsonpath
@@ -102,6 +103,17 @@ def update_request(self, request: Request) -> None:
 
 class BaseNextUrlPaginator(BasePaginator):
     def update_request(self, request: Request) -> None:
+        """Set ``request.url`` to the next page reference.
+
+        A reference without a URL scheme is treated as relative: it is resolved
+        against the current ``request.url`` and the resolved value is stored
+        back in ``self.next_reference``.
+        """
+        # Handle relative URLs
+        if self.next_reference:
+            parsed_url = urlparse(self.next_reference)
+            if not parsed_url.scheme:
+                self.next_reference = urljoin(request.url, self.next_reference)
+
         request.url = self.next_reference
 
 
diff --git a/tests/sources/helpers/rest_client/test_client.py b/tests/sources/helpers/rest_client/test_client.py
index 7a4c55f9a6..88653efefe 100644
--- a/tests/sources/helpers/rest_client/test_client.py
+++ b/tests/sources/helpers/rest_client/test_client.py
@@ -74,6 +74,23 @@ def test_default_paginator(self, rest_client: RESTClient):
 
         assert_pagination(pages)
 
+    def test_explicit_paginator(self, rest_client: RESTClient):
+        pages_iter = rest_client.paginate(
+            "/posts", paginator=JSONResponsePaginator(next_url_path="next_page")
+        )
+        pages = list(pages_iter)
+
+        assert_pagination(pages)
+
+    def test_explicit_paginator_relative_next_url(self, rest_client: RESTClient):
+        pages_iter = rest_client.paginate(
+            "/posts_relative_next_url",
+            paginator=JSONResponsePaginator(next_url_path="next_page"),
+        )
+        pages = list(pages_iter)
+
+        assert_pagination(pages)
+
     def test_paginate_with_hooks(self, rest_client: RESTClient):
         def response_hook(response: Response, *args: Any, **kwargs: Any) -> None:
             if response.status_code == 404:
diff --git a/tests/sources/helpers/rest_client/test_paginators.py b/tests/sources/helpers/rest_client/test_paginators.py
index 64cfb8a9e2..bd38a2e421 100644
--- a/tests/sources/helpers/rest_client/test_paginators.py
+++ b/tests/sources/helpers/rest_client/test_paginators.py
@@ -2,7 +2,7 @@
 
 import pytest
 
-from requests.models import Response
+from requests.models import Response, Request
 
 from dlt.sources.helpers.rest_client.paginators import (
     SinglePagePaginator,
@@ -89,6 +89,73 @@ def test_update_state(self, test_case):
         assert paginator.next_reference == test_case["expected"]["next_reference"]
         assert paginator.has_next_page == test_case["expected"]["has_next_page"]
 
+    # Test update_request from BaseNextUrlPaginator
+    @pytest.mark.parametrize(
+        "test_case",
+        [
+            # Test with absolute URL
+            {
+                "next_reference": "http://example.com/api/resource?page=2",
+                "request_url": "http://example.com/api/resource",
+                "expected": "http://example.com/api/resource?page=2",
+            },
+            # Test with relative URL
+            {
+                "next_reference": "/api/resource?page=2",
+                "request_url": "http://example.com/api/resource",
+                "expected": "http://example.com/api/resource?page=2",
+            },
+            # Test with more nested path
+            {
+                "next_reference": "/api/resource/subresource?page=3&sort=desc",
+                "request_url": "http://example.com/api/resource/subresource",
+                "expected": "http://example.com/api/resource/subresource?page=3&sort=desc",
+            },
+            # Test with 'page' in path
+            {
+                "next_reference": "/api/page/4/items?filter=active",
+                "request_url": "http://example.com/api/page/3/items",
+                "expected": "http://example.com/api/page/4/items?filter=active",
+            },
+            # Test with complex query parameters
+            {
+                "next_reference": "/api/resource?page=3&category=books&sort=author",
+                "request_url": "http://example.com/api/resource?page=2",
+                "expected": "http://example.com/api/resource?page=3&category=books&sort=author",
+            },
+            # Test with URL having port number
+            {
+                "next_reference": "/api/resource?page=2",
+                "request_url": "http://example.com:8080/api/resource",
+                "expected": "http://example.com:8080/api/resource?page=2",
+            },
+            # Test with HTTPS protocol
+            {
+                "next_reference": "https://secure.example.com/api/resource?page=2",
+                "request_url": "https://secure.example.com/api/resource",
+                "expected": "https://secure.example.com/api/resource?page=2",
+            },
+            # Test with encoded characters in URL
+            {
+                "next_reference": "/api/resource?page=2&query=%E3%81%82",
+                "request_url": "http://example.com/api/resource",
+                "expected": "http://example.com/api/resource?page=2&query=%E3%81%82",
+            },
+            # Test with missing 'page' parameter in next_reference
+            {
+                "next_reference": "/api/resource?sort=asc",
+                "request_url": "http://example.com/api/resource?page=1",
+                "expected": "http://example.com/api/resource?sort=asc",
+            },
+        ],
+    )
+    def test_update_request(self, test_case):
+        paginator = JSONResponsePaginator()
+        paginator.next_reference = test_case["next_reference"]
+        request = Mock(Request, url=test_case["request_url"])
+        paginator.update_request(request)
+        assert request.url == test_case["expected"]
+
 
 class TestSinglePagePaginator:
     def test_update_state(self):