Skip to content

Commit 50a5b40

Browse files
update FilesExt retry logic (#1211)
## What changes are proposed in this pull request? **WHAT** - Extending retry function with a new parameter `max_attempt` to allow client to retry and fail after certain amount - Remove 500 from FilesExt retry status code - Add new config to set the retry attempts for FilesExt - Update the retry logic of FilesExt to fail after certain attempts. **WHY** - 500 errors shouldn't be retried - The FilesExt should always prioritize fallback over retry to avoid regression ## How is this tested? Unit tests were updated to reflect the change. --------- Co-authored-by: Parth Bansal <parth.bansal@databricks.com>
1 parent c284b9c commit 50a5b40

File tree

6 files changed

+129
-12
lines changed

6 files changed

+129
-12
lines changed

NEXT_CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,14 @@
44

55
### New Features and Improvements
66

7+
* FilesExt retry logic now respects a retry count limit in addition to the time-based timeout. Operations will stop retrying when either the retry count (`experimental_files_ext_cloud_api_max_retries`, default: 3) or timeout (`retry_timeout_seconds`) is exceeded, whichever comes first. This provides faster feedback when APIs are consistently unavailable.
8+
79
### Security
810

911
### Bug Fixes
1012

13+
* FilesExt no longer retries on 500 (Internal Server Error) responses. These errors now fail immediately or fallback to alternative upload methods as appropriate.
14+
1115
### Documentation
1216

1317
### Internal Changes

databricks/sdk/config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,11 @@ class Config:
233233
# Cap on the number of custom retries during parallel downloads.
234234
files_ext_parallel_download_max_retries = 3
235235

236+
# Maximum number of retry attempts for FilesExt cloud API operations.
237+
# This works in conjunction with retry_timeout_seconds - whichever limit
238+
# is hit first will stop the retry loop.
239+
experimental_files_ext_cloud_api_max_retries: int = 3
240+
236241
def __init__(
237242
self,
238243
*,

databricks/sdk/mixins/files.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -752,7 +752,7 @@ class FilesExt(files.FilesAPI):
752752
__doc__ = files.FilesAPI.__doc__
753753

754754
# note that these error codes are retryable only for idempotent operations
755-
_RETRYABLE_STATUS_CODES: list[int] = [408, 429, 500, 502, 503, 504]
755+
_RETRYABLE_STATUS_CODES: list[int] = [408, 429, 502, 503, 504]
756756

757757
@dataclass(frozen=True)
758758
class _UploadContext:
@@ -2340,6 +2340,7 @@ def extended_is_retryable(e: BaseException) -> Optional[str]:
23402340

23412341
return retried(
23422342
timeout=timedelta(seconds=retry_timeout_seconds),
2343+
max_attempts=self._config.experimental_files_ext_cloud_api_max_retries,
23432344
# also retry on network errors (connection error, connection timeout)
23442345
# where we believe request didn't reach the server
23452346
is_retryable=extended_is_retryable,

databricks/sdk/retries.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ def retried(
1818
timeout=timedelta(minutes=20),
1919
clock: Optional[Clock] = None,
2020
before_retry: Optional[Callable] = None,
21+
max_attempts: Optional[int] = None,
2122
):
2223
has_allowlist = on is not None
2324
has_callback = is_retryable is not None
@@ -33,7 +34,7 @@ def wrapper(*args, **kwargs):
3334
deadline = clock.time() + timeout.total_seconds()
3435
attempt = 1
3536
last_err = None
36-
while clock.time() < deadline:
37+
while clock.time() < deadline and (max_attempts is None or attempt <= max_attempts):
3738
try:
3839
return func(*args, **kwargs)
3940
except Exception as err:
@@ -64,6 +65,10 @@ def wrapper(*args, **kwargs):
6465

6566
clock.sleep(sleep + random())
6667
attempt += 1
68+
69+
# Determine which limit was hit
70+
if max_attempts is not None and attempt > max_attempts:
71+
raise RuntimeError(f"Exceeded max retry attempts ({max_attempts})") from last_err
6772
raise TimeoutError(f"Timed out after {timeout}") from last_err
6873

6974
return wrapper

tests/test_files.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1076,9 +1076,9 @@ def run(self, config: Config, monkeypatch) -> None:
10761076
expected_download_api="files_api",
10771077
),
10781078
PresignedUrlDownloadTestCase(
1079-
name="Presigned URL download fails with 500 when downloading from URL",
1079+
name="Presigned URL download fails with 500 when downloading from URL, fallback to Files API",
10801080
file_size=100 * 1024 * 1024,
1081-
expected_exception_type=TimeoutError, # TimeoutError is raised after retries are exhausted
1081+
expected_download_api="files_api", # 500 is no longer retried, falls back immediately
10821082
custom_response_download_from_url=CustomResponse(code=500),
10831083
),
10841084
PresignedUrlDownloadTestCase(
@@ -1094,9 +1094,9 @@ def run(self, config: Config, monkeypatch) -> None:
10941094
),
10951095
# Recoverable errors
10961096
PresignedUrlDownloadTestCase(
1097-
name="Intermittent error should succeed after retry: Presigned URL download fails with 500 when downloading from URL",
1097+
name="Intermittent error should succeed after retry: Presigned URL download fails with 502 when downloading from URL",
10981098
file_size=100 * 1024 * 1024,
1099-
custom_response_download_from_url=CustomResponse(code=500, only_invocation=1),
1099+
custom_response_download_from_url=CustomResponse(code=502, only_invocation=1),
11001100
),
11011101
PresignedUrlDownloadTestCase(
11021102
name="Intermittent error should succeed after retry: Presigned URL expires with 403 when downloading from URL",
@@ -2181,38 +2181,38 @@ def to_string(test_case: "MultipartUploadTestCase") -> str:
21812181
content_size=100 * 1024 * 1024, # 10 parts
21822182
multipart_upload_part_size=10 * 1024 * 1024,
21832183
custom_response_on_upload=CustomResponse(exception=requests.ConnectionError, first_invocation=8),
2184-
expected_exception_type=TimeoutError,
2184+
expected_exception_type=RuntimeError,
21852185
expected_multipart_upload_aborted=True,
21862186
),
21872187
MultipartUploadTestCase(
21882188
"Upload part: permanent retryable status code",
21892189
content_size=100 * 1024 * 1024, # 10 parts
21902190
multipart_upload_part_size=10 * 1024 * 1024,
21912191
custom_response_on_upload=CustomResponse(code=429, first_invocation=8),
2192-
expected_exception_type=TimeoutError,
2192+
expected_exception_type=RuntimeError,
21932193
expected_multipart_upload_aborted=True,
21942194
),
21952195
MultipartUploadTestCase(
21962196
"Upload part: intermittent retryable error",
21972197
content_size=100 * 1024 * 1024, # 10 parts
21982198
multipart_upload_part_size=10 * 1024 * 1024,
21992199
custom_response_on_upload=CustomResponse(
2200-
exception=requests.ConnectionError, first_invocation=2, last_invocation=5
2200+
exception=requests.ConnectionError, first_invocation=2, last_invocation=3
22012201
),
22022202
expected_multipart_upload_aborted=False,
22032203
),
22042204
MultipartUploadTestCase(
22052205
"Upload part: intermittent retryable status code 429",
22062206
content_size=100 * 1024 * 1024, # 10 parts
22072207
multipart_upload_part_size=10 * 1024 * 1024,
2208-
custom_response_on_upload=CustomResponse(code=429, first_invocation=2, last_invocation=4),
2208+
custom_response_on_upload=CustomResponse(code=429, first_invocation=2, last_invocation=3),
22092209
expected_multipart_upload_aborted=False,
22102210
),
22112211
MultipartUploadTestCase(
2212-
"Upload chunk: intermittent retryable status code 500",
2212+
"Upload chunk: intermittent retryable status code 502",
22132213
content_size=100 * 1024 * 1024, # 10 parts
22142214
multipart_upload_part_size=10 * 1024 * 1024,
2215-
custom_response_on_upload=CustomResponse(code=500, first_invocation=2, last_invocation=4),
2215+
custom_response_on_upload=CustomResponse(code=502, first_invocation=2, last_invocation=3),
22162216
expected_multipart_upload_aborted=False,
22172217
),
22182218
# -------------------------- failures on abort --------------------------

tests/test_retries.py

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,3 +293,105 @@ def fn() -> Tuple[Any, Optional[RetryError]]:
293293
assert clock.time() >= min_time
294294
elif scenario in ("halt", "unexpected"):
295295
assert call_count == attempts
296+
297+
298+
@pytest.mark.parametrize(
299+
"timeout_seconds,max_attempts,fail_count,expected_error,expected_min_calls,expected_max_calls,expected_msg",
300+
[
301+
pytest.param(
302+
60,
303+
3,
304+
None,
305+
RuntimeError,
306+
3,
307+
3,
308+
"Exceeded max retry attempts (3)",
309+
id="max_attempts_respected",
310+
),
311+
pytest.param(
312+
5,
313+
100,
314+
None,
315+
TimeoutError,
316+
1,
317+
99,
318+
"Timed out after",
319+
id="timeout_exceeded_before_max_attempts",
320+
),
321+
pytest.param(
322+
60,
323+
5,
324+
2,
325+
None,
326+
3,
327+
3,
328+
None,
329+
id="successful_retry_within_max_attempts",
330+
),
331+
pytest.param(
332+
3,
333+
None,
334+
None,
335+
TimeoutError,
336+
2,
337+
None,
338+
"Timed out after",
339+
id="max_attempts_none",
340+
),
341+
],
342+
)
343+
def test_max_attempts_behavior(
344+
timeout_seconds, max_attempts, fail_count, expected_error, expected_min_calls, expected_max_calls, expected_msg
345+
):
346+
"""Test max_attempts parameter behavior with various configurations."""
347+
clock = FakeClock()
348+
call_count = 0
349+
350+
@retried(
351+
is_retryable=lambda _: "always retry",
352+
timeout=timedelta(seconds=timeout_seconds),
353+
max_attempts=max_attempts,
354+
clock=clock,
355+
)
356+
def test_function():
357+
nonlocal call_count
358+
call_count += 1
359+
if fail_count is None or call_count <= fail_count:
360+
raise ValueError("test error")
361+
return "success"
362+
363+
if expected_error:
364+
with pytest.raises(expected_error) as exc_info:
365+
test_function()
366+
if expected_msg:
367+
assert expected_msg in str(exc_info.value)
368+
else:
369+
result = test_function()
370+
assert result == "success"
371+
372+
assert call_count >= expected_min_calls
373+
if expected_max_calls:
374+
assert call_count <= expected_max_calls
375+
376+
377+
def test_max_attempts_with_non_retryable_error():
378+
"""Test that non-retryable errors are raised immediately regardless of max_attempts."""
379+
clock = FakeClock()
380+
call_count = 0
381+
382+
@retried(
383+
on=[ValueError],
384+
timeout=timedelta(seconds=60),
385+
max_attempts=5,
386+
clock=clock,
387+
)
388+
def raises_non_retryable():
389+
nonlocal call_count
390+
call_count += 1
391+
raise KeyError("not retryable")
392+
393+
with pytest.raises(KeyError):
394+
raises_non_retryable()
395+
396+
# Should only attempt once since error is not retryable.
397+
assert call_count == 1

0 commit comments

Comments
 (0)