Skip to content

Commit 834bf64

Browse files
authored
Merge pull request #31 from rdmueller/feature/structure-index
feat: Structure Index - In-Memory Document Index (Issue #5)
2 parents c5a0988 + 1c88beb commit 834bf64

File tree

4 files changed

+1151
-2
lines changed

4 files changed

+1151
-2
lines changed

src/docs/arc42/chapters/11_technical_risks.adoc

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,3 +38,27 @@ Technical debt consists of design or implementation choices that are expedient i
3838
| **Custom Parser (ADR-005)** | Building a custom parser is a significant undertaking and creates a complex component that must be maintained. | **Consequence**: High maintenance cost. **Repayment**: As AsciiDoc parsing libraries in Python mature, periodically re-evaluate whether this component can be replaced with a standard, community-maintained library.
3939
| **No File Watching** | The in-memory index (ADR-002) does not automatically update if files are changed externally (e.g., by a user in a text editor). | **Consequence**: The index can become stale, leading to incorrect data being served. **Repayment**: Implement a file-watching mechanism that triggers a re-indexing of changed files. This was deferred to reduce initial complexity.
4040
|===
41+
42+
=== Technical Debt Tracking
43+
44+
Implementation-related technical debt is tracked as GitHub issues with the `tech-debt` prefix. These issues document gaps between specification and implementation that are deferred for future work.
45+
46+
.Active Tech-Debt Issues
47+
[cols="1,3"]
48+
|===
49+
| Issue | Description
50+
51+
| https://github.com/rdmueller/AsciiDoc-MCP/issues/14[TD-ADOC-001]
52+
| AsciiDoc ifdef/ifndef conditional support
53+
54+
| https://github.com/rdmueller/AsciiDoc-MCP/issues/19[TD-ADOC-002]
55+
| Include options and attribute substitution in paths
56+
57+
| https://github.com/rdmueller/AsciiDoc-MCP/issues/28[tech-debt: Markdown Parser]
58+
| Additional features from spec (end_line, reserved frontmatter fields, content extraction)
59+
60+
| https://github.com/rdmueller/AsciiDoc-MCP/issues/32[tech-debt: Structure Index]
61+
| File→Sections mapping, rebuild() method, element index within section
62+
|===
63+
64+
NOTE: For the current list of all tech-debt issues, see the https://github.com/rdmueller/AsciiDoc-MCP/issues?q=is%3Aissue+is%3Aopen+tech-debt[open GitHub issues matching `tech-debt`].

src/mcp_server/structure_index.py

Lines changed: 355 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,355 @@
1+
"""Structure Index for fast document lookups.
2+
3+
This module provides an in-memory index for fast lookups of document
4+
structure and sections. It is built from parsed documents (AsciiDoc
5+
or Markdown) and supports the API endpoints defined in the specification.
6+
7+
Key features:
8+
- O(1) lookup by hierarchical path
9+
- Element filtering by type and section
10+
- Simple text search across section titles
11+
- Statistics for health checks
12+
"""
13+
14+
import logging
15+
from dataclasses import dataclass
16+
17+
from mcp_server.models import Document, Element, Section
18+
19+
logger = logging.getLogger(__name__)
20+
21+
22+
@dataclass
class SearchResult:
    """One hit produced by a search over the index.

    Attributes:
        path: Hierarchical path of the matched item.
        line: Line number of the match in its source file.
        context: Text that matched (or surrounds the match).
        score: Relevance of the hit, from 0 (weak) to 1 (strong).
    """

    # Field order is part of the positional constructor signature.
    path: str
    line: int
    context: str
    score: float
37+
38+
39+
class StructureIndex:
    """In-memory index for fast document structure lookups.

    The index is built from already-parsed documents and offers lookups by
    hierarchical path, by nesting level, and by element type / parent
    section, plus a simple substring search over section titles.

    Attributes:
        _path_to_section: Mapping of hierarchical path to Section
        _level_to_sections: Mapping of level to list of Sections
        _elements: List of all elements, in indexing order
        _type_to_elements: Mapping of element type to list of Elements
        _section_to_elements: Mapping of section path to list of Elements
        _documents: Private copy of the list of indexed documents
        _top_level_sections: Top-level sections of all documents, in order
        _index_ready: Whether the index has been built
    """

    def __init__(self) -> None:
        """Initialize an empty index."""
        self._path_to_section: dict[str, Section] = {}
        self._level_to_sections: dict[int, list[Section]] = {}
        self._elements: list[Element] = []
        self._type_to_elements: dict[str, list[Element]] = {}
        self._section_to_elements: dict[str, list[Element]] = {}
        self._documents: list[Document] = []
        self._top_level_sections: list[Section] = []
        self._index_ready: bool = False

    def build_from_documents(self, documents: list[Document]) -> list[str]:
        """Build index from parsed documents, replacing any previous content.

        Args:
            documents: List of parsed documents (AsciiDoc or Markdown)

        Returns:
            List of warning messages (e.g., duplicate paths)
        """
        # Take a private snapshot *before* clearing. Storing the caller's
        # list directly would let clear() empty it, so rebuilding from the
        # same list object would silently index nothing.
        snapshot = list(documents)

        self.clear()
        self._documents = snapshot

        warnings: list[str] = []
        for doc in snapshot:
            # NOTE(review): a top-level section is recorded here even when
            # _index_section() rejects its path as a duplicate - confirm
            # that this is the intended behavior for get_structure().
            for section in doc.sections:
                self._top_level_sections.append(section)
                warnings.extend(self._index_section(section))

            for element in doc.elements:
                self._index_element(element)

        self._index_ready = True
        # Lazy %-formatting: the message is only rendered if INFO is enabled.
        logger.info(
            "Index built: %d sections, %d elements from %d documents",
            len(self._path_to_section),
            len(self._elements),
            len(snapshot),
        )

        return warnings

    def get_structure(self, max_depth: int | None = None) -> dict:
        """Get hierarchical document structure.

        Args:
            max_depth: Maximum depth to return (None for unlimited).
                Top-level sections are at depth 1 and are always returned;
                the limit only controls how far their children are expanded.

        Returns:
            Dictionary with 'sections' (hierarchical tree) and 'total_sections'
        """
        sections = [
            self._section_to_dict(section, max_depth, current_depth=1)
            for section in self._top_level_sections
        ]
        return {
            "sections": sections,
            "total_sections": len(self._path_to_section),
        }

    def get_section(self, path: str) -> Section | None:
        """Find section by hierarchical path.

        Args:
            path: Hierarchical path (e.g., "/chapter-1/section-2")

        Returns:
            Section if found, None otherwise
        """
        return self._path_to_section.get(path)

    def get_sections_at_level(self, level: int) -> list[Section]:
        """Get all sections at a specific level.

        Args:
            level: Nesting level (1 = chapter, 2 = section, etc.)

        Returns:
            New list of sections at the specified level (empty if none)
        """
        # Return a copy so callers cannot mutate the index through it.
        return list(self._level_to_sections.get(level, ()))

    def get_elements(
        self,
        element_type: str | None = None,
        section_path: str | None = None,
    ) -> list[Element]:
        """Get elements, optionally filtered by type and/or section.

        Args:
            element_type: Optional type filter (code, table, image, etc.)
            section_path: Optional section path filter

        Returns:
            New list of matching elements, in indexing order
        """
        # Every branch returns a fresh list so the internal lists stay private.
        if element_type is None:
            if section_path is None:
                return list(self._elements)
            # Section-only query: answered directly from the per-section index.
            return list(self._section_to_elements.get(section_path, ()))

        of_type = self._type_to_elements.get(element_type, ())
        if section_path is None:
            return list(of_type)
        return [e for e in of_type if e.parent_section == section_path]

    def search(
        self,
        query: str,
        scope: str | None = None,
        case_sensitive: bool = False,
        max_results: int = 50,
    ) -> list[SearchResult]:
        """Search for content matching query.

        Currently searches section titles only (substring match). Future
        versions may search full content.

        Args:
            query: Search query string
            scope: Optional path prefix to limit search scope (plain string
                prefix match against the section path)
            case_sensitive: Whether search is case-sensitive
            max_results: Maximum number of results to return; values of
                zero or less yield an empty list

        Returns:
            List of SearchResult objects, best score first
        """
        # A negative limit would otherwise slice results off the *end*.
        if max_results <= 0:
            return []

        needle = query if case_sensitive else query.lower()
        results: list[SearchResult] = []

        for path, section in self._path_to_section.items():
            if scope is not None and not path.startswith(scope):
                continue

            haystack = section.title if case_sensitive else section.title.lower()
            match_pos = haystack.find(needle)
            if match_pos < 0:
                continue

            # Earlier matches rank higher: 1.0 for a match at the start of
            # the title, approaching 0.5 for a match at its very end.
            score = 1.0 - (match_pos / max(len(haystack), 1)) * 0.5
            results.append(
                SearchResult(
                    path=path,
                    line=section.source_location.line,
                    context=section.title,
                    score=score,
                )
            )

        # Sort by score descending
        results.sort(key=lambda r: r.score, reverse=True)

        return results[:max_results]

    def clear(self) -> None:
        """Clear the index and mark it as not ready."""
        self._path_to_section.clear()
        self._level_to_sections.clear()
        self._elements.clear()
        self._type_to_elements.clear()
        self._section_to_elements.clear()
        self._documents.clear()
        self._top_level_sections.clear()
        self._index_ready = False

    def stats(self) -> dict:
        """Return index statistics.

        Returns:
            Dictionary with index statistics for health checks: total
            section/element/document counts, the ready flag, and counts
            broken down by section level and by element type
        """
        return {
            "total_sections": len(self._path_to_section),
            "total_elements": len(self._elements),
            "total_documents": len(self._documents),
            "index_ready": self._index_ready,
            "sections_by_level": {
                level: len(sections)
                for level, sections in self._level_to_sections.items()
            },
            "elements_by_type": {
                etype: len(elements)
                for etype, elements in self._type_to_elements.items()
            },
        }

    def _index_section(self, section: Section) -> list[str]:
        """Index a section and its children recursively.

        A section whose path is already indexed is rejected with a warning
        and not added to the path or level indexes; its children are still
        visited in case they have unique paths.

        Args:
            section: Section to index

        Returns:
            List of warning messages
        """
        warnings: list[str] = []

        first = self._path_to_section.get(section.path)
        if first is not None:
            warnings.append(
                f"Duplicate section path: '{section.path}' "
                f"(first at {first.source_location.file}:"
                f"{first.source_location.line}, "
                f"duplicate at {section.source_location.file}:{section.source_location.line})"
            )
        else:
            self._path_to_section[section.path] = section
            self._level_to_sections.setdefault(section.level, []).append(section)

        # Children are indexed whether or not this section was accepted.
        for child in section.children:
            warnings.extend(self._index_section(child))

        return warnings

    def _index_element(self, element: Element) -> None:
        """Index an element by position, type and parent section.

        Args:
            element: Element to index
        """
        self._elements.append(element)
        self._type_to_elements.setdefault(element.type, []).append(element)
        self._section_to_elements.setdefault(element.parent_section, []).append(
            element
        )

    def _section_to_dict(
        self,
        section: Section,
        max_depth: int | None,
        current_depth: int,
    ) -> dict:
        """Convert a section to a dictionary for API response.

        Args:
            section: Section to convert
            max_depth: Maximum depth to include children (None for unlimited)
            current_depth: Depth of ``section`` in the tree (top level is 1)

        Returns:
            Dictionary with 'path', 'title', 'level', 'location' and
            'children'; 'children' is empty once max_depth is reached
        """
        expand = max_depth is None or current_depth < max_depth
        children = (
            [
                self._section_to_dict(child, max_depth, current_depth + 1)
                for child in section.children
            ]
            if expand
            else []
        )

        return {
            "path": section.path,
            "title": section.title,
            "level": section.level,
            "location": {
                "file": str(section.source_location.file),
                "line": section.source_location.line,
            },
            "children": children,
        }

0 commit comments

Comments
 (0)