Skip to content

Commit

Permalink
Enhance BaseNextUrlPaginator to support relative next URLs in pagination
Browse files Browse the repository at this point in the history
  • Loading branch information
burnash committed Mar 31, 2024
1 parent f7dead9 commit b080d34
Show file tree
Hide file tree
Showing 3 changed files with 92 additions and 1 deletion.
7 changes: 7 additions & 0 deletions dlt/sources/helpers/rest_client/paginators.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from abc import ABC, abstractmethod
from typing import Optional
from urllib.parse import urlparse, urljoin

from dlt.sources.helpers.requests import Response, Request
from dlt.common import jsonpath
Expand Down Expand Up @@ -102,6 +103,12 @@ def update_request(self, request: Request) -> None:

class BaseNextUrlPaginator(BasePaginator):
    """Paginator base whose next-page reference is used as the request URL."""

    def update_request(self, request: Request) -> None:
        """Point ``request`` at the next page.

        A non-empty ``next_reference`` that carries no URL scheme is treated
        as relative: it is joined onto the current ``request.url`` and the
        resolved value is written back to ``next_reference``. Whatever value
        ``next_reference`` holds afterwards is assigned to ``request.url``.
        """
        target = self.next_reference

        # Scheme-less references ("/path?page=2") are resolved against the
        # URL of the request that produced them.
        if target and not urlparse(target).scheme:
            target = urljoin(request.url, target)
            self.next_reference = target

        request.url = target


Expand Down
17 changes: 17 additions & 0 deletions tests/sources/helpers/rest_client/test_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ def test_default_paginator(self, rest_client: RESTClient):

assert_pagination(pages)

def test_excplicit_paginator(self, rest_client: RESTClient):
    """Paginate "/posts" with an explicitly supplied JSONResponsePaginator."""
    explicit_paginator = JSONResponsePaginator(next_url_path="next_page")
    pages = list(rest_client.paginate("/posts", paginator=explicit_paginator))

    assert_pagination(pages)

def test_excplicit_paginator_relative_next_url(self, rest_client: RESTClient):
    """Paginate "/posts_relative_next_url" with an explicit JSONResponsePaginator.

    NOTE(review): the endpoint name suggests the mock API returns relative
    "next_page" links here -- confirm against the test fixtures.
    """
    explicit_paginator = JSONResponsePaginator(next_url_path="next_page")
    pages = list(
        rest_client.paginate(
            "/posts_relative_next_url",
            paginator=explicit_paginator,
        )
    )

    assert_pagination(pages)

def test_paginate_with_hooks(self, rest_client: RESTClient):
def response_hook(response: Response, *args: Any, **kwargs: Any) -> None:
if response.status_code == 404:
Expand Down
69 changes: 68 additions & 1 deletion tests/sources/helpers/rest_client/test_paginators.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import pytest

from requests.models import Response
from requests.models import Response, Request

from dlt.sources.helpers.rest_client.paginators import (
SinglePagePaginator,
Expand Down Expand Up @@ -89,6 +89,73 @@ def test_update_state(self, test_case):
assert paginator.next_reference == test_case["expected"]["next_reference"]
assert paginator.has_next_page == test_case["expected"]["has_next_page"]

# update_request (inherited from BaseNextUrlPaginator): each case gives the
# paginator's next_reference, the URL of the outgoing request, and the URL
# the request is expected to carry afterwards.
@pytest.mark.parametrize(
    "test_case",
    [
        # Absolute next URL
        {
            "next_reference": "http://example.com/api/resource?page=2",
            "request_url": "http://example.com/api/resource",
            "expected": "http://example.com/api/resource?page=2",
        },
        # Relative next URL
        {
            "next_reference": "/api/resource?page=2",
            "request_url": "http://example.com/api/resource",
            "expected": "http://example.com/api/resource?page=2",
        },
        # Relative next URL with a more deeply nested path
        {
            "next_reference": "/api/resource/subresource?page=3&sort=desc",
            "request_url": "http://example.com/api/resource/subresource",
            "expected": "http://example.com/api/resource/subresource?page=3&sort=desc",
        },
        # Page number carried in the path rather than the query string
        {
            "next_reference": "/api/page/4/items?filter=active",
            "request_url": "http://example.com/api/page/3/items",
            "expected": "http://example.com/api/page/4/items?filter=active",
        },
        # Several query parameters; request URL already has a query string
        {
            "next_reference": "/api/resource?page=3&category=books&sort=author",
            "request_url": "http://example.com/api/resource?page=2",
            "expected": "http://example.com/api/resource?page=3&category=books&sort=author",
        },
        # Request URL with an explicit port number
        {
            "next_reference": "/api/resource?page=2",
            "request_url": "http://example.com:8080/api/resource",
            "expected": "http://example.com:8080/api/resource?page=2",
        },
        # Absolute next URL over HTTPS
        {
            "next_reference": "https://secure.example.com/api/resource?page=2",
            "request_url": "https://secure.example.com/api/resource",
            "expected": "https://secure.example.com/api/resource?page=2",
        },
        # Percent-encoded characters in the query string
        {
            "next_reference": "/api/resource?page=2&query=%E3%81%82",
            "request_url": "http://example.com/api/resource",
            "expected": "http://example.com/api/resource?page=2&query=%E3%81%82",
        },
        # next_reference without a 'page' parameter
        {
            "next_reference": "/api/resource?sort=asc",
            "request_url": "http://example.com/api/resource?page=1",
            "expected": "http://example.com/api/resource?sort=asc",
        },
    ],
)
def test_update_request(self, test_case):
    """Check that update_request leaves the expected URL on the request."""
    # Arrange: a mocked Request at the current URL and a paginator that
    # already holds the next reference.
    outgoing = Mock(Request, url=test_case["request_url"])
    paginator = JSONResponsePaginator()
    paginator.next_reference = test_case["next_reference"]

    # Act
    paginator.update_request(outgoing)

    # Assert
    assert outgoing.url == test_case["expected"]


class TestSinglePagePaginator:
def test_update_state(self):
Expand Down

0 comments on commit b080d34

Please sign in to comment.