Skip to content

Commit 15584c9

Browse files
raifdmueller and claude
authored and committed
fix: duplicate paths for dotted filenames + missing validation warnings (#266, #268)
#266: Files with dots in names (e.g. version numbers like v0.4.27.md) got duplicate paths because Path.with_suffix("") stripped version-like suffixes. Fix: strip only known doc extensions (.md, .adoc, .asciidoc) via new strip_doc_extension() utility. Also pass base_path to MarkdownStructureParser in CLI and MCP server.

#268: Duplicate-path warnings from index building were only logged to stderr, not included in validate_structure JSON output. Fix: store build warnings in StructureIndex._build_warnings and include them as "duplicate_path" type in validation results.

Bumps version to 0.4.30.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f2428c1 commit 15584c9

File tree

12 files changed

+260
-15
lines changed

12 files changed

+260
-15
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "dacli"
3-
version = "0.4.29"
3+
version = "0.4.30"
44
description = "Documentation Access CLI - Navigate and query large documentation projects"
55
readme = "README.md"
66
license = { text = "MIT" }

src/dacli/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,4 +5,4 @@
55
"""
66

77

8-
__version__ = "0.4.29"
8+
__version__ = "0.4.30"

src/dacli/asciidoc_parser.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
collect_all_sections,
2424
find_section_by_path,
2525
slugify,
26+
strip_doc_extension,
2627
)
2728

2829
# Regex patterns from specification
@@ -189,6 +190,9 @@ def _get_file_prefix(self, file_path: Path) -> str:
189190
The file prefix is the relative path from base_path to file_path,
190191
without the file extension. This ensures unique paths across documents.
191192
193+
Issue #266: Only strips known extensions (.md, .adoc) to preserve dots
194+
in filenames (e.g. version numbers like "report_v1.2.3.adoc").
195+
192196
Args:
193197
file_path: Path to the document being parsed
194198
@@ -198,10 +202,8 @@ def _get_file_prefix(self, file_path: Path) -> str:
198202
try:
199203
relative = file_path.relative_to(self.base_path)
200204
except ValueError:
201-
# file_path is not relative to base_path, use just the stem
202-
relative = Path(file_path.stem)
203-
# Remove extension and convert to forward slashes
204-
return str(relative.with_suffix("")).replace("\\", "/")
205+
relative = Path(file_path.name)
206+
return strip_doc_extension(relative)
205207

206208
def get_section(
207209
self, doc: AsciidocDocument, path: str

src/dacli/cli.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -284,7 +284,7 @@ def __init__(
284284
self.index = StructureIndex()
285285
self.file_handler = FileSystemHandler()
286286
self.asciidoc_parser = AsciidocStructureParser(base_path=docs_root)
287-
self.markdown_parser = MarkdownStructureParser()
287+
self.markdown_parser = MarkdownStructureParser(base_path=docs_root)
288288

289289
# Build index
290290
_build_index(

src/dacli/markdown_parser.py

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
collect_all_sections,
2424
find_section_by_path,
2525
slugify,
26+
strip_doc_extension,
2627
)
2728

2829
logger = logging.getLogger(__name__)
@@ -118,6 +119,9 @@ def _get_file_prefix(self, file_path: Path) -> str:
118119
The file prefix is the relative path from base_path to file_path,
119120
without the file extension. This ensures unique paths across documents.
120121
122+
Issue #266: Only strips known extensions (.md, .adoc) to preserve dots
123+
in filenames (e.g. version numbers like "report_v1.2.3.md").
124+
121125
Args:
122126
file_path: Path to the document being parsed
123127
@@ -128,13 +132,10 @@ def _get_file_prefix(self, file_path: Path) -> str:
128132
try:
129133
relative = file_path.relative_to(self.base_path)
130134
except ValueError:
131-
# file_path is not relative to base_path, use just the stem
132-
relative = Path(file_path.stem)
135+
relative = Path(file_path.name)
133136
else:
134-
# No base_path provided, use just the stem
135-
relative = Path(file_path.stem)
136-
# Remove extension and convert to forward slashes
137-
return str(relative.with_suffix("")).replace("\\", "/")
137+
relative = Path(file_path.name)
138+
return strip_doc_extension(relative)
138139

139140
def parse_file(self, file_path: Path) -> MarkdownDocument:
140141
"""Parse a single Markdown file.

src/dacli/mcp_app.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ def create_mcp_server(
134134
index = StructureIndex()
135135
file_handler = FileSystemHandler()
136136
asciidoc_parser = AsciidocStructureParser(base_path=docs_root)
137-
markdown_parser = MarkdownStructureParser()
137+
markdown_parser = MarkdownStructureParser(base_path=docs_root)
138138

139139
# Build initial index
140140
_build_index(

src/dacli/parser_utils.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,34 @@
55
"""
66

77
import re
8+
from pathlib import Path
89

910
from dacli.models import Section
1011

12+
# Known document extensions to strip from file paths (Issue #266)
13+
KNOWN_DOC_EXTENSIONS = {".md", ".adoc", ".asciidoc"}
14+
15+
16+
def strip_doc_extension(file_path: Path) -> str:
17+
"""Remove only known document extensions from a file path.
18+
19+
Unlike Path.with_suffix(""), this only removes known extensions (.md, .adoc,
20+
.asciidoc) and preserves dots that are part of the filename (e.g. version
21+
numbers like "report_v1.2.3.md" → "report_v1.2.3").
22+
23+
Args:
24+
file_path: Path to strip extension from
25+
26+
Returns:
27+
String path with known extension removed, using forward slashes.
28+
"""
29+
path_str = str(file_path).replace("\\", "/")
30+
suffix = file_path.suffix.lower()
31+
if suffix in KNOWN_DOC_EXTENSIONS:
32+
# Remove only the last suffix if it's a known doc extension
33+
return path_str[: -len(file_path.suffix)]
34+
return path_str
35+
1136

1237
def slugify(text: str) -> str:
1338
"""Convert text to URL-friendly slug.

src/dacli/services/validation_service.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,20 @@ def validate_structure(index: StructureIndex, docs_root: Path) -> dict:
9090
"message": pw.message,
9191
})
9292

93+
# Issue #268: Include duplicate-path warnings from index build
94+
for build_warning in index._build_warnings:
95+
if "Duplicate section path" in build_warning:
96+
# Parse the warning string to extract the path
97+
# Format: "Duplicate section path: 'path' (first at file:line, duplicate at file:line)"
98+
import re
99+
match = re.search(r"Duplicate section path: '([^']+)'", build_warning)
100+
dup_path = match.group(1) if match else "unknown"
101+
warnings.append({
102+
"type": "duplicate_path",
103+
"path": dup_path,
104+
"message": build_warning,
105+
})
106+
93107
# Issue #219: Check for unresolved includes
94108
for doc in index._documents:
95109
# Only AsciiDoc documents have includes (check for attribute)

src/dacli/structure_index.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ def __init__(self) -> None:
6969
self._documents: list[Document] = []
7070
self._top_level_sections: list[Section] = []
7171
self._circular_include_errors: list[dict] = []
72+
self._build_warnings: list[str] = [] # Issue #268: Store duplicate path warnings
7273
self._index_ready: bool = False
7374

7475
def build_from_documents(self, documents: list[Document]) -> list[str]:
@@ -99,6 +100,7 @@ def build_from_documents(self, documents: list[Document]) -> list[str]:
99100
for element in doc.elements:
100101
self._index_element(element)
101102

103+
self._build_warnings = warnings # Issue #268: Store for validation
102104
self._index_ready = True
103105
logger.info(
104106
f"Index built: {len(self._path_to_section)} sections, "
@@ -494,6 +496,7 @@ def clear(self) -> None:
494496
self._documents.clear()
495497
self._top_level_sections.clear()
496498
self._circular_include_errors.clear()
499+
self._build_warnings.clear()
497500
self._index_ready = False
498501

499502
def stats(self) -> dict:

tests/test_dotted_filenames_266.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
"""Tests for dotted filenames producing unique paths (Issue #266)."""
2+
3+
from dacli.markdown_parser import MarkdownStructureParser
4+
from dacli.structure_index import StructureIndex
5+
6+
7+
class TestDottedFilenames:
8+
"""Files with dots in names (e.g. version numbers) must have unique paths."""
9+
10+
def test_version_numbered_files_have_unique_paths(self, tmp_path):
11+
"""DACLI_TEST_RESULTS_v0.4.27.md and v0.4.28.md must not collide."""
12+
f1 = tmp_path / "RESULTS_v0.4.27.md"
13+
f2 = tmp_path / "RESULTS_v0.4.28.md"
14+
f1.write_text("# Results v0.4.27\n\nContent.\n")
15+
f2.write_text("# Results v0.4.28\n\nContent.\n")
16+
17+
parser = MarkdownStructureParser(base_path=tmp_path)
18+
doc1 = parser.parse_file(f1)
19+
doc2 = parser.parse_file(f2)
20+
21+
index = StructureIndex()
22+
warnings = index.build_from_documents([doc1, doc2])
23+
24+
assert len(warnings) == 0, f"Unexpected warnings: {warnings}"
25+
26+
structure = index.get_structure()
27+
paths = [s["path"] for s in structure["sections"]]
28+
assert len(paths) == len(set(paths)), f"Duplicate paths found: {paths}"
29+
30+
def test_cli_context_passes_base_path_to_markdown_parser(self, tmp_path):
31+
"""CliContext must pass docs_root as base_path to MarkdownStructureParser."""
32+
from dacli.cli import CliContext
33+
34+
f1 = tmp_path / "test_v1.2.3.md"
35+
f1.write_text("# Test v1.2.3\n")
36+
37+
ctx = CliContext(
38+
docs_root=tmp_path,
39+
output_format="json",
40+
pretty=False,
41+
)
42+
# The markdown parser should have base_path set
43+
assert ctx.markdown_parser.base_path == tmp_path
44+
45+
def test_file_prefix_without_base_path_strips_only_known_extensions(self, tmp_path):
46+
"""Without base_path, only .md extension should be stripped, not version dots."""
47+
f1 = tmp_path / "report_v2.1.5.md"
48+
f1.write_text("# Report\n")
49+
50+
# BUG #266: Without base_path, Path(stem).with_suffix("") strips ".5"
51+
parser = MarkdownStructureParser() # No base_path!
52+
doc = parser.parse_file(f1)
53+
54+
# The path should preserve the full version number
55+
assert doc.sections[0].path == "report_v2.1.5"
56+
57+
def test_get_file_prefix_preserves_version_dots(self, tmp_path):
58+
"""_get_file_prefix must not strip version-like suffixes."""
59+
parser = MarkdownStructureParser(base_path=tmp_path)
60+
prefix = parser._get_file_prefix(tmp_path / "data_v3.2.1.md")
61+
assert prefix == "data_v3.2.1"
62+
63+
def test_subdirectory_file_with_dots(self, tmp_path):
64+
"""Files with dots in subdirectories also get correct paths."""
65+
sub = tmp_path / "reports"
66+
sub.mkdir()
67+
f1 = sub / "sprint_2.0.1.md"
68+
f1.write_text("# Sprint 2.0.1\n\nNotes.\n")
69+
70+
parser = MarkdownStructureParser(base_path=tmp_path)
71+
doc = parser.parse_file(f1)
72+
73+
assert doc.sections[0].path == "reports/sprint_2.0.1"
74+
75+
76+
class TestDottedFilenamesAsciiDoc:
77+
"""AsciiDoc files with dots in names must also have unique paths."""
78+
79+
def test_asciidoc_file_with_version_dots(self, tmp_path):
80+
"""AsciiDoc _get_file_prefix must preserve version dots."""
81+
from dacli.asciidoc_parser import AsciidocStructureParser
82+
83+
f1 = tmp_path / "release_v1.2.3.adoc"
84+
f1.write_text("= Release v1.2.3\n\nContent.\n")
85+
86+
parser = AsciidocStructureParser(base_path=tmp_path)
87+
doc = parser.parse_file(f1)
88+
89+
assert doc.sections[0].path == "release_v1.2.3"

0 commit comments

Comments
 (0)