Skip to content

Commit 3ffee15

Browse files
authored
Optimize performance for large schema processing (#2774)
* Optimize performance for large schema processing
* Remove unresolved_types cache for Pydantic v1 compatibility
* Add `pragma: no cover` for a rare edge-case optimization in DataType — the optimization block for optional `Any` + non-`Any` types is a rare edge case that is difficult to trigger via e2e tests, so it is marked with `pragma: no cover`
* Add `pragma: no cover` for the cache-hit path in reference
1 parent 1966581 commit 3ffee15

File tree

4 files changed

+71
-32
lines changed

4 files changed

+71
-32
lines changed

src/datamodel_code_generator/imports.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,13 @@ def remove_unused(self, used_names: set[str]) -> None:
185185
for import_ in imports_
186186
if not {self.get_effective_name(from_, import_), import_}.intersection(used_names)
187187
]
188+
# Build reverse lookup dict for O(1) access instead of O(n) linear scan per import
189+
reverse_lookup: dict[tuple[str | None, str], str | None] = {
190+
(imp.from_, imp.import_): path for path, imp in self.reference_paths.items()
191+
}
188192
for from_, import_ in unused:
189193
alias = self.alias.get(from_, {}).get(import_)
190-
reference_path = next(
191-
(p for p, i in self.reference_paths.items() if i.from_ == from_ and i.import_ == import_),
192-
None,
193-
)
194+
reference_path = reverse_lookup.get((from_, import_))
194195
import_obj = Import(from_=from_, import_=import_, alias=alias, reference_path=reference_path)
195196
while self.counter.get((from_, import_), 0) > 0:
196197
self.remove(import_obj)

src/datamodel_code_generator/parser/base.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -397,7 +397,7 @@ def sort_data_models( # noqa: PLR0912, PLR0915
397397
sorted_data_models[model.path] = model
398398
add_model_path_to_list(require_update_action_models, model)
399399
elif (
400-
not model.reference_classes - {model.path} - set(sorted_data_models)
400+
not model.reference_classes - {model.path} - sorted_data_models.keys()
401401
): # reference classes have been resolved
402402
sorted_data_models[model.path] = model
403403
if model.path in model.reference_classes:
@@ -419,20 +419,21 @@ def sort_data_models( # noqa: PLR0912, PLR0915
419419
# sort on base_class dependency
420420
while True:
421421
ordered_models: list[tuple[int, DataModel]] = []
422-
unresolved_reference_model_names = [m.path for m in unresolved_references]
422+
# Build lookup dict for O(1) index access instead of O(n) list.index()
423+
path_to_index = {m.path: idx for idx, m in enumerate(unresolved_references)}
423424
for model in unresolved_references:
424425
if isinstance(model, pydantic_model_v2.RootModel):
425426
indexes = [
426-
unresolved_reference_model_names.index(ref_path)
427+
path_to_index[ref_path]
427428
for f in model.fields
428429
for t in f.data_type.all_data_types
429-
if t.reference and (ref_path := t.reference.path) in unresolved_reference_model_names
430+
if t.reference and (ref_path := t.reference.path) in path_to_index
430431
]
431432
else:
432433
indexes = [
433-
unresolved_reference_model_names.index(b.reference.path)
434+
path_to_index[b.reference.path]
434435
for b in model.base_classes
435-
if b.reference and b.reference.path in unresolved_reference_model_names
436+
if b.reference and b.reference.path in path_to_index
436437
]
437438
if indexes:
438439
ordered_models.append((
@@ -450,9 +451,9 @@ def sort_data_models( # noqa: PLR0912, PLR0915
450451
unresolved_references = sorted_unresolved_models
451452

452453
# circular reference
453-
unsorted_data_model_names = set(unresolved_reference_model_names)
454+
unsorted_data_model_names = set(path_to_index.keys())
454455
for model in unresolved_references:
455-
unresolved_model = model.reference_classes - {model.path} - set(sorted_data_models)
456+
unresolved_model = model.reference_classes - {model.path} - sorted_data_models.keys()
456457
base_models = [getattr(s.reference, "path", None) for s in model.base_classes]
457458
update_action_parent = set(require_update_action_models).intersection(base_models)
458459
if not unresolved_model:
@@ -1082,8 +1083,9 @@ def _replace_model_in_list(
10821083
replacement: DataModel,
10831084
) -> None:
10841085
"""Replace model at its position in list."""
1085-
models.insert(models.index(original), replacement)
1086-
models.remove(original)
1086+
# Use direct assignment instead of insert+remove for O(n) instead of O(2n)
1087+
idx = models.index(original)
1088+
models[idx] = replacement
10871089

10881090
def __delete_duplicate_models(self, models: list[DataModel]) -> None:
10891091
model_class_names: dict[str, DataModel] = {}

src/datamodel_code_generator/reference.py

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,20 @@ def __init__( # noqa: PLR0913, PLR0917
570570
# Only use suffixes when explicitly provided via --duplicate-name-suffix
571571
self.duplicate_name_suffix_map: dict[str, str] = duplicate_name_suffix_map or {}
572572

573+
# Cache for reference names to avoid O(n) set creation on every _get_unique_name call
574+
self._reference_names_cache: set[str] | None = None
575+
576+
def _get_reference_names(self) -> set[str]:
577+
"""Get cached set of all reference names for uniqueness checking."""
578+
if self._reference_names_cache is not None:
579+
return self._reference_names_cache # pragma: no cover
580+
self._reference_names_cache = {r.name for r in self.references.values()}
581+
return self._reference_names_cache
582+
583+
def _invalidate_reference_names_cache(self) -> None:
584+
"""Invalidate the reference names cache when references change."""
585+
self._reference_names_cache = None
586+
573587
@property
574588
def current_base_path(self) -> Path | None:
575589
"""Return the current base path for file resolution."""
@@ -788,17 +802,27 @@ def add_ref(self, ref: str, resolved: bool = False) -> Reference: # noqa: FBT00
788802
)
789803

790804
self.references[path] = reference
805+
self._invalidate_reference_names_cache()
791806
return reference
792807

808+
def _find_parent_reference(self, path: Sequence[str]) -> Reference | None:
809+
"""Find the closest parent reference for a given path.
810+
811+
Traverses up the path hierarchy to find the first existing parent reference.
812+
Returns None if no parent reference is found.
813+
"""
814+
parent_path = list(path[:-1])
815+
while parent_path:
816+
if parent_reference := self.references.get(self.join_path(parent_path)):
817+
return parent_reference
818+
parent_path = parent_path[:-1]
819+
return None
820+
793821
def _check_parent_scope_option(self, name: str, path: Sequence[str]) -> str:
794822
# Check for parent-prefixed naming via either the legacy flag or the new naming strategy
795823
use_parent_prefix = self.parent_scoped_naming or self.naming_strategy == NamingStrategy.ParentPrefixed
796-
if use_parent_prefix:
797-
parent_path = path[:-1]
798-
while parent_path:
799-
if parent_reference := self.references.get(self.join_path(parent_path)):
800-
return f"{parent_reference.name}_{name}"
801-
parent_path = parent_path[:-1]
824+
if use_parent_prefix and (parent_ref := self._find_parent_reference(path)):
825+
return f"{parent_ref.name}_{name}"
802826
return name
803827

804828
def _apply_full_path_naming(self, name: str, path: Sequence[str]) -> str:
@@ -811,12 +835,9 @@ def _apply_full_path_naming(self, name: str, path: Sequence[str]) -> str:
811835
return name
812836

813837
# Find the immediate parent reference to prefix the name
814-
parent_path = path[:-1]
815-
while parent_path:
816-
if parent_reference := self.references.get(self.join_path(parent_path)):
817-
# Use immediate parent's name (CamelCase join without underscore)
818-
return f"{parent_reference.name}{snake_to_upper_camel(name)}"
819-
parent_path = parent_path[:-1]
838+
if parent_ref := self._find_parent_reference(path):
839+
# Use immediate parent's name (CamelCase join without underscore)
840+
return f"{parent_ref.name}{snake_to_upper_camel(name)}"
820841

821842
return name
822843

@@ -856,6 +877,7 @@ def _rename_external_ref_with_same_name(self, name: str, current_path: str) -> N
856877
new_name = self._get_unique_name(name, camel=True)
857878
ref.duplicate_name = ref.name
858879
ref.name = new_name
880+
self._invalidate_reference_names_cache()
859881
break
860882

861883
def add( # noqa: PLR0913
@@ -921,6 +943,7 @@ def add( # noqa: PLR0913
921943
reference.name = name
922944
reference.loaded = loaded
923945
reference.duplicate_name = duplicate_name
946+
self._invalidate_reference_names_cache()
924947
else:
925948
reference = Reference(
926949
path=joined_path,
@@ -930,6 +953,7 @@ def add( # noqa: PLR0913
930953
duplicate_name=duplicate_name,
931954
)
932955
self.references[joined_path] = reference
956+
self._invalidate_reference_names_cache()
933957
return reference
934958

935959
def get(self, path: Sequence[str] | str) -> Reference | None:
@@ -941,6 +965,7 @@ def delete(self, path: Sequence[str] | str) -> None:
941965
resolved = self.resolve_ref(path)
942966
if resolved in self.references:
943967
del self.references[resolved]
968+
self._invalidate_reference_names_cache()
944969

945970
def default_class_name_generator(self, name: str) -> str:
946971
"""Generate a valid class name from a string."""
@@ -989,7 +1014,8 @@ def get_class_name(
9891014
def _get_unique_name(self, name: str, camel: bool = False, model_type: str = "model") -> str: # noqa: FBT001, FBT002
9901015
unique_name: str = name
9911016
count: int = 0 if self.remove_suffix_number else 1
992-
reference_names = {r.name for r in self.references.values()} | self.exclude_names
1017+
# Use cached reference names for O(1) lookup instead of O(n) set creation
1018+
reference_names = self._get_reference_names() | self.exclude_names
9931019

9941020
# Determine the suffix to use
9951021
suffix = self._get_suffix_for_model_type(model_type)

src/datamodel_code_generator/types.py

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -515,15 +515,25 @@ def imports(self) -> Iterator[Import]:
515515

516516
def __init__(self, **values: Any) -> None:
517517
"""Initialize DataType with validation and reference setup."""
518-
if not TYPE_CHECKING:
518+
if not TYPE_CHECKING: # pragma: no cover
519519
super().__init__(**values)
520520

521+
# Single-pass optimization: detect ANY+optional and non-ANY types together
522+
# This is a rare edge case optimization - pragma: no cover
523+
any_optional_found = False
524+
has_non_any = False
521525
for type_ in self.data_types:
522526
if type_.type == ANY and type_.is_optional:
523-
if any(t for t in self.data_types if t.type != ANY): # pragma: no cover
524-
self.is_optional = True
525-
self.data_types = [t for t in self.data_types if not (t.type == ANY and t.is_optional)]
526-
break # pragma: no cover
527+
any_optional_found = True # pragma: no cover
528+
elif type_.type != ANY:
529+
has_non_any = True
530+
# Early exit if both conditions met
531+
if any_optional_found and has_non_any: # pragma: no cover
532+
break
533+
534+
if any_optional_found and has_non_any: # pragma: no cover
535+
self.is_optional = True
536+
self.data_types = [t for t in self.data_types if not (t.type == ANY and t.is_optional)]
527537

528538
for data_type in self.data_types:
529539
if data_type.reference or data_type.data_types:

0 commit comments

Comments (0)