diff --git a/tap_rest_api_msdk/pagination.py b/tap_rest_api_msdk/pagination.py index 0f51660..c6e3fa4 100644 --- a/tap_rest_api_msdk/pagination.py +++ b/tap_rest_api_msdk/pagination.py @@ -168,23 +168,18 @@ def get_next_url(self, response: requests.Response) -> Optional[Any]: if not response.links.get("next", {}).get("url"): return None - resp_json = response.json() - if isinstance(resp_json, list): - results = resp_json - else: - results = resp_json.get("items") - - # Exit early if the response has no items. ? Maybe duplicative the "next" link - # check. - if not results: - return None - # Unfortunately endpoints such as /starred, /stargazers, /events and /pulls do # not support the "since" parameter out of the box. So we use a workaround here # to exit early. For such streams, we sort by descending dates (most recent # first), and paginate "back in time" until we reach records before our # "fake_since" parameter. if self.replication_key and self.use_fake_since_parameter: + resp_json = response.json() + if isinstance(resp_json, list): + results = resp_json + else: + results = resp_json.get("items") + request_parameters = parse_qs(str(urlparse(response.request.url).query)) # parse_qs interprets "+" as a space, revert this to keep an aware datetime try: @@ -217,10 +212,5 @@ def get_next_url(self, response: requests.Response) -> Optional[Any]: ): return None - # Use header links returned by the API to return the query parameters. - parsed_url = urlparse(response.links["next"]["url"]) - - if parsed_url.query: - return parsed_url.query - - return None + # Use header links returned by the API to return the next URL. + return response.links["next"]["url"] diff --git a/tap_rest_api_msdk/streams.py b/tap_rest_api_msdk/streams.py index d54dfdb..859193d 100644 --- a/tap_rest_api_msdk/streams.py +++ b/tap_rest_api_msdk/streams.py @@ -473,7 +473,10 @@ def _get_url_params_header_link( limit_per_page_param = "per_page" params[limit_per_page_param] = pagination_page_size if next_page_token: - request_parameters = parse_qs(str(next_page_token)) + # next_page_token is a ParseResult object from BaseHATEOASPaginator + # Extract the query string properly + query_string = next_page_token.query if hasattr(next_page_token, 'query') else str(next_page_token) + request_parameters = parse_qs(query_string) for k, v in request_parameters.items(): params[k] = v diff --git a/tests/test_streams.py b/tests/test_streams.py index bc2f94b..e2e98d4 100644 --- a/tests/test_streams.py +++ b/tests/test_streams.py @@ -133,3 +133,82 @@ def second_matcher(request): json_resp()["records"][0], json_resp()["records"][1], ] + + +def test_header_link_pagination_with_non_items_key(requests_mock): + """Test that header link pagination works with response keys other than 'items'. + + This validates the fix for Bug #1 where RestAPIHeaderLinkPaginator would + exit early if the response didn't contain an 'items' key. + """ + cfg = config({ + "pagination_request_style": "restapi_header_link_paginator", + "pagination_response_style": "header_link", + "streams": [ + { + "name": "test_stream", + "path": "/test", + "primary_keys": ["id"], + "records_path": "$.data[*]", # Using 'data' instead of 'items' + } + ], + }) + + # First page with Link header + requests_mock.get( + "https://example.com/test", + headers={"Link": '; rel="next"'}, + json={"data": [{"id": 1}, {"id": 2}]}, + ) + + # Second page without Link header + requests_mock.get( + "https://example.com/test?page=2", + json={"data": [{"id": 3}]}, + ) + + stream0 = TapRestApiMsdk(config=cfg, parse_env_config=True).discover_streams()[0] + records = list(stream0.get_records({})) + + assert len(records) == 3 + assert records[0]["id"] == 1 + assert records[2]["id"] == 3 + + +def test_header_link_pagination_backward_compatibility(requests_mock): + """Test that header link pagination still works with 'items' key (backward compatibility). + + This ensures the fix doesn't break existing implementations. + """ + cfg = config({ + "pagination_request_style": "restapi_header_link_paginator", + "pagination_response_style": "header_link", + "streams": [ + { + "name": "test_stream", + "path": "/test", + "primary_keys": ["id"], + "records_path": "$.items[*]", + } + ], + }) + + # First page with Link header + requests_mock.get( + "https://example.com/test", + headers={"Link": '; rel="next"'}, + json={"items": [{"id": 1}, {"id": 2}]}, + ) + + # Second page without Link header + requests_mock.get( + "https://example.com/test?page=2", + json={"items": [{"id": 3}]}, + ) + + stream0 = TapRestApiMsdk(config=cfg, parse_env_config=True).discover_streams()[0] + records = list(stream0.get_records({})) + + assert len(records) == 3 + assert records[0]["id"] == 1 + assert records[2]["id"] == 3