Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 8 additions & 18 deletions tap_rest_api_msdk/pagination.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,23 +168,18 @@ def get_next_url(self, response: requests.Response) -> Optional[Any]:
if not response.links.get("next", {}).get("url"):
return None

resp_json = response.json()
if isinstance(resp_json, list):
results = resp_json
else:
results = resp_json.get("items")

# Exit early if the response has no items. ? Maybe duplicative the "next" link
# check.
if not results:
return None

# Unfortunately endpoints such as /starred, /stargazers, /events and /pulls do
# not support the "since" parameter out of the box. So we use a workaround here
# to exit early. For such streams, we sort by descending dates (most recent
# first), and paginate "back in time" until we reach records before our
# "fake_since" parameter.
if self.replication_key and self.use_fake_since_parameter:
resp_json = response.json()
if isinstance(resp_json, list):
results = resp_json
else:
results = resp_json.get("items")

request_parameters = parse_qs(str(urlparse(response.request.url).query))
# parse_qs interprets "+" as a space, revert this to keep an aware datetime
try:
Expand Down Expand Up @@ -217,10 +212,5 @@ def get_next_url(self, response: requests.Response) -> Optional[Any]:
):
return None

# Use header links returned by the API to return the query parameters.
parsed_url = urlparse(response.links["next"]["url"])

if parsed_url.query:
return parsed_url.query

return None
# Use header links returned by the API to return the next URL.
return response.links["next"]["url"]
5 changes: 4 additions & 1 deletion tap_rest_api_msdk/streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,10 @@ def _get_url_params_header_link(
limit_per_page_param = "per_page"
params[limit_per_page_param] = pagination_page_size
if next_page_token:
request_parameters = parse_qs(str(next_page_token))
# next_page_token is a ParseResult object from BaseHATEOASPaginator
# Extract the query string properly
query_string = next_page_token.query if hasattr(next_page_token, 'query') else str(next_page_token)
request_parameters = parse_qs(query_string)
for k, v in request_parameters.items():
params[k] = v

Expand Down
79 changes: 79 additions & 0 deletions tests/test_streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,3 +133,82 @@ def second_matcher(request):
json_resp()["records"][0],
json_resp()["records"][1],
]


def test_header_link_pagination_with_non_items_key(requests_mock):
"""Test that header link pagination works with response keys other than 'items'.

This validates the fix for Bug #1 where RestAPIHeaderLinkPaginator would
exit early if the response didn't contain an 'items' key.
"""
cfg = config({
"pagination_request_style": "restapi_header_link_paginator",
"pagination_response_style": "header_link",
"streams": [
{
"name": "test_stream",
"path": "/test",
"primary_keys": ["id"],
"records_path": "$.data[*]", # Using 'data' instead of 'items'
}
],
})

# First page with Link header
requests_mock.get(
"https://example.com/test",
headers={"Link": '<https://example.com/test?page=2>; rel="next"'},
json={"data": [{"id": 1}, {"id": 2}]},
)

# Second page without Link header
requests_mock.get(
"https://example.com/test?page=2",
json={"data": [{"id": 3}]},
)

stream0 = TapRestApiMsdk(config=cfg, parse_env_config=True).discover_streams()[0]
records = list(stream0.get_records({}))

assert len(records) == 3
assert records[0]["id"] == 1
assert records[2]["id"] == 3


def test_header_link_pagination_backward_compatibility(requests_mock):
"""Test that header link pagination still works with 'items' key (backward compatibility).

This ensures the fix doesn't break existing implementations.
"""
cfg = config({
"pagination_request_style": "restapi_header_link_paginator",
"pagination_response_style": "header_link",
"streams": [
{
"name": "test_stream",
"path": "/test",
"primary_keys": ["id"],
"records_path": "$.items[*]",
}
],
})

# First page with Link header
requests_mock.get(
"https://example.com/test",
headers={"Link": '<https://example.com/test?page=2>; rel="next"'},
json={"items": [{"id": 1}, {"id": 2}]},
)

# Second page without Link header
requests_mock.get(
"https://example.com/test?page=2",
json={"items": [{"id": 3}]},
)

stream0 = TapRestApiMsdk(config=cfg, parse_env_config=True).discover_streams()[0]
records = list(stream0.get_records({}))

assert len(records) == 3
assert records[0]["id"] == 1
assert records[2]["id"] == 3