|
14 | 14 | # KIND, either express or implied. See the License for the |
15 | 15 | # specific language governing permissions and limitations |
16 | 16 | # under the License. |
| 17 | +from collections import deque |
17 | 18 | from enum import Enum |
18 | 19 | from typing import ( |
19 | 20 | TYPE_CHECKING, |
20 | 21 | Any, |
21 | 22 | Union, |
22 | 23 | ) |
23 | 24 |
|
24 | | -from pydantic import ConfigDict, Field, field_validator |
| 25 | +from pydantic import ConfigDict, Field, TypeAdapter, field_validator |
25 | 26 | from requests import HTTPError, Session |
26 | 27 | from tenacity import RetryCallState, retry, retry_if_exception_type, stop_after_attempt |
27 | 28 |
|
|
36 | 37 | ) |
37 | 38 | from pyiceberg.catalog.rest.auth import AuthManager, AuthManagerAdapter, AuthManagerFactory, LegacyOAuth2AuthManager |
38 | 39 | from pyiceberg.catalog.rest.response import _handle_non_200_response |
| 40 | +from pyiceberg.catalog.rest.scan_planning import ( |
| 41 | + FetchScanTasksRequest, |
| 42 | + PlanCancelled, |
| 43 | + PlanCompleted, |
| 44 | + PlanFailed, |
| 45 | + PlanningResponse, |
| 46 | + PlanSubmitted, |
| 47 | + PlanTableScanRequest, |
| 48 | + ScanTasks, |
| 49 | +) |
39 | 50 | from pyiceberg.exceptions import ( |
40 | 51 | AuthorizationExpiredError, |
41 | 52 | CommitFailedException, |
|
44 | 55 | NamespaceNotEmptyError, |
45 | 56 | NoSuchIdentifierError, |
46 | 57 | NoSuchNamespaceError, |
| 58 | + NoSuchPlanTaskError, |
47 | 59 | NoSuchTableError, |
48 | 60 | NoSuchViewError, |
49 | 61 | TableAlreadyExistsError, |
|
56 | 68 | CommitTableRequest, |
57 | 69 | CommitTableResponse, |
58 | 70 | CreateTableTransaction, |
| 71 | + FileScanTask, |
59 | 72 | StagedTable, |
60 | 73 | Table, |
61 | 74 | TableIdentifier, |
@@ -316,6 +329,9 @@ class ListViewsResponse(IcebergBaseModel): |
316 | 329 | identifiers: list[ListViewResponseEntry] = Field() |
317 | 330 |
|
318 | 331 |
|
| 332 | +_PLANNING_RESPONSE_ADAPTER = TypeAdapter(PlanningResponse) |
| 333 | + |
| 334 | + |
319 | 335 | class RestCatalog(Catalog): |
320 | 336 | uri: str |
321 | 337 | _session: Session |
@@ -375,15 +391,113 @@ def _create_session(self) -> Session: |
375 | 391 |
|
376 | 392 | return session |
377 | 393 |
|
378 | | - def is_rest_scan_planning_enabled(self) -> bool: |
379 | | - """Check if rest server-side scan planning is enabled. |
| 394 | + def supports_server_side_planning(self) -> bool: |
| 395 | + """Check if the catalog supports server-side scan planning.""" |
| 396 | + return Capability.V1_SUBMIT_TABLE_SCAN_PLAN in self._supported_endpoints and property_as_bool( |
| 397 | + self.properties, REST_SCAN_PLANNING_ENABLED, REST_SCAN_PLANNING_ENABLED_DEFAULT |
| 398 | + ) |
| 399 | + |
| 400 | + @retry(**_RETRY_ARGS) |
| 401 | + def _plan_table_scan(self, identifier: str | Identifier, request: PlanTableScanRequest) -> PlanningResponse: |
| 402 | + """Submit a scan plan request to the REST server. |
| 403 | +
|
| 404 | + Args: |
| 405 | + identifier: Table identifier. |
| 406 | + request: The scan plan request parameters. |
380 | 407 |
|
381 | 408 | Returns: |
382 | | - True if enabled, False otherwise. |
| 409 | + PlanningResponse the result of the scan plan request representing the status |
| 410 | +
|
| 411 | + Raises: |
| 412 | + NoSuchTableError: If a table with the given identifier does not exist. |
383 | 413 | """ |
384 | | - return Capability.V1_SUBMIT_TABLE_SCAN_PLAN in self._supported_endpoints and property_as_bool( |
385 | | - self.properties, REST_SCAN_PLANNING_ENABLED, REST_SCAN_PLANNING_ENABLED_DEFAULT |
| 414 | + self._check_endpoint(Capability.V1_SUBMIT_TABLE_SCAN_PLAN) |
| 415 | + response = self._session.post( |
| 416 | + self.url(Endpoints.plan_table_scan, prefixed=True, **self._split_identifier_for_path(identifier)), |
| 417 | + data=request.model_dump_json(by_alias=True, exclude_none=True).encode(UTF8), |
386 | 418 | ) |
| 419 | + try: |
| 420 | + response.raise_for_status() |
| 421 | + except HTTPError as exc: |
| 422 | + _handle_non_200_response(exc, {404: NoSuchTableError}) |
| 423 | + |
| 424 | + return _PLANNING_RESPONSE_ADAPTER.validate_json(response.text) |
| 425 | + |
| 426 | + @retry(**_RETRY_ARGS) |
| 427 | + def _fetch_scan_tasks(self, identifier: str | Identifier, plan_task: str) -> ScanTasks: |
| 428 | + """Fetch additional scan tasks using a plan task token. |
| 429 | +
|
| 430 | + Args: |
| 431 | + identifier: Table identifier. |
| 432 | + plan_task: The plan task token from a previous response. |
| 433 | +
|
| 434 | + Returns: |
| 435 | + ScanTasks containing file scan tasks and possibly more plan-task tokens. |
| 436 | +
|
| 437 | + Raises: |
| 438 | + NoSuchPlanTaskError: If a plan task with the given identifier or task does not exist. |
| 439 | + """ |
| 440 | + self._check_endpoint(Capability.V1_TABLE_SCAN_PLAN_TASKS) |
| 441 | + request = FetchScanTasksRequest(plan_task=plan_task) |
| 442 | + response = self._session.post( |
| 443 | + self.url(Endpoints.fetch_scan_tasks, prefixed=True, **self._split_identifier_for_path(identifier)), |
| 444 | + data=request.model_dump_json(by_alias=True).encode(UTF8), |
| 445 | + ) |
| 446 | + try: |
| 447 | + response.raise_for_status() |
| 448 | + except HTTPError as exc: |
| 449 | + _handle_non_200_response(exc, {404: NoSuchPlanTaskError}) |
| 450 | + |
| 451 | + return ScanTasks.model_validate_json(response.text) |
| 452 | + |
| 453 | + def plan_scan(self, identifier: str | Identifier, request: PlanTableScanRequest) -> list[FileScanTask]: |
| 454 | + """Plan a table scan and return FileScanTasks. |
| 455 | +
|
| 456 | + Handles the full scan planning lifecycle including pagination. |
| 457 | +
|
| 458 | + Args: |
| 459 | + identifier: Table identifier. |
| 460 | + request: The scan plan request parameters. |
| 461 | +
|
| 462 | + Returns: |
| 463 | + List of FileScanTask objects ready for execution. |
| 464 | +
|
| 465 | + Raises: |
| 466 | + RuntimeError: If planning fails, is cancelled, or returns unexpected response. |
| 467 | + NotImplementedError: If async planning is required but not yet supported. |
| 468 | + """ |
| 469 | + response = self._plan_table_scan(identifier, request) |
| 470 | + |
| 471 | + if isinstance(response, PlanFailed): |
| 472 | + error_msg = response.error.message if response.error else "unknown error" |
| 473 | + raise RuntimeError(f"Received status: failed: {error_msg}") |
| 474 | + |
| 475 | + if isinstance(response, PlanCancelled): |
| 476 | + raise RuntimeError("Received status: cancelled") |
| 477 | + |
| 478 | + if isinstance(response, PlanSubmitted): |
| 479 | + # TODO: implement polling for async planning |
| 480 | + raise NotImplementedError(f"Async scan planning not yet supported for planId: {response.plan_id}") |
| 481 | + |
| 482 | + if not isinstance(response, PlanCompleted): |
| 483 | + raise RuntimeError(f"Invalid planStatus for response: {type(response).__name__}") |
| 484 | + |
| 485 | + tasks: list[FileScanTask] = [] |
| 486 | + |
| 487 | + # Collect tasks from initial response |
| 488 | + for task in response.file_scan_tasks: |
| 489 | + tasks.append(FileScanTask.from_rest_response(task, response.delete_files)) |
| 490 | + |
| 491 | + # Fetch and collect from additional batches |
| 492 | + pending_tasks = deque(response.plan_tasks) |
| 493 | + while pending_tasks: |
| 494 | + plan_task = pending_tasks.popleft() |
| 495 | + batch = self._fetch_scan_tasks(identifier, plan_task) |
| 496 | + for task in batch.file_scan_tasks: |
| 497 | + tasks.append(FileScanTask.from_rest_response(task, batch.delete_files)) |
| 498 | + pending_tasks.extend(batch.plan_tasks) |
| 499 | + |
| 500 | + return tasks |
387 | 501 |
|
388 | 502 | def _create_legacy_oauth2_auth_manager(self, session: Session) -> AuthManager: |
389 | 503 | """Create the LegacyOAuth2AuthManager by fetching required properties. |
|
0 commit comments