-
Notifications
You must be signed in to change notification settings - Fork 1.2k
perf: Parallelize DynamoDB batch reads in sync online_read #6024
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -16,6 +16,7 @@ | |
| import itertools | ||
| import logging | ||
| from collections import OrderedDict, defaultdict | ||
| from concurrent.futures import ThreadPoolExecutor | ||
| from datetime import datetime | ||
| from typing import Any, Callable, Dict, List, Literal, Optional, Sequence, Tuple, Union | ||
|
|
||
|
|
@@ -479,33 +480,57 @@ def online_read( | |
| online_config.endpoint_url, | ||
| online_config.session_based_auth, | ||
| ) | ||
| table_instance = dynamodb_resource.Table( | ||
| _get_table_name(online_config, config, table) | ||
| ) | ||
| table_name = _get_table_name(online_config, config, table) | ||
| table_instance = dynamodb_resource.Table(table_name) | ||
|
|
||
| batch_size = online_config.batch_size | ||
| entity_ids = self._to_entity_ids(config, entity_keys) | ||
| entity_ids_iter = iter(entity_ids) | ||
| result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] | ||
|
|
||
| # Split entity_ids into batches upfront | ||
| batches: List[List[str]] = [] | ||
| entity_ids_iter = iter(entity_ids) | ||
| while True: | ||
| batch = list(itertools.islice(entity_ids_iter, batch_size)) | ||
|
|
||
| # No more items to insert | ||
| if len(batch) == 0: | ||
| if not batch: | ||
| break | ||
| batches.append(batch) | ||
|
|
||
| if not batches: | ||
| return [] | ||
|
|
||
| # For single batch, no parallelization overhead needed | ||
| if len(batches) == 1: | ||
| batch_entity_ids = self._to_resource_batch_get_payload( | ||
| online_config, table_instance.name, batch | ||
| online_config, table_instance.name, batches[0] | ||
| ) | ||
| response = dynamodb_resource.batch_get_item( | ||
| RequestItems=batch_entity_ids, | ||
| response = dynamodb_resource.batch_get_item(RequestItems=batch_entity_ids) | ||
| return self._process_batch_get_response(table_name, response, batches[0]) | ||
|
|
||
| # Execute batch requests in parallel for multiple batches | ||
| # Note: boto3 resources are NOT thread-safe, so we create a new resource per thread | ||
| def fetch_batch(batch: List[str]) -> Dict[str, Any]: | ||
| thread_resource = _initialize_dynamodb_resource( | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think here we are creating a new session on each thread. Instead we can share a single |
||
| online_config.region, | ||
| online_config.endpoint_url, | ||
| online_config.session_based_auth, | ||
| ) | ||
| batch_result = self._process_batch_get_response( | ||
| table_instance.name, | ||
| response, | ||
| batch, | ||
| batch_entity_ids = self._to_resource_batch_get_payload( | ||
| online_config, table_name, batch | ||
| ) | ||
| return thread_resource.batch_get_item(RequestItems=batch_entity_ids) | ||
|
|
||
| # Use ThreadPoolExecutor for parallel I/O | ||
| # Cap at 10 workers to avoid excessive thread creation | ||
| max_workers = min(len(batches), 10) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think this should be configurable ( |
||
| with ThreadPoolExecutor(max_workers=max_workers) as executor: | ||
| responses = list(executor.map(fetch_batch, batches)) | ||
devin-ai-integration[bot] marked this conversation as resolved.
Show resolved
Hide resolved
|
||
|
|
||
| # Process responses and merge results in order | ||
| result: List[Tuple[Optional[datetime], Optional[Dict[str, ValueProto]]]] = [] | ||
| for batch, response in zip(batches, responses): | ||
| batch_result = self._process_batch_get_response(table_name, response, batch) | ||
| result.extend(batch_result) | ||
|
|
||
| return result | ||
|
|
||
| async def online_read_async( | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
table_instance is only used in the single-batch path, and seems unnecessary since table_name can be used instead of table_instance.name