fix: duplicate paths for dotted filenames + missing validation warnings (#266, #268)

raifdmueller · claude · rdmueller · commit 15584c901876 · 2026-02-07T22:45:40.000+01:00
#266: Files with dots in names (e.g. version numbers like v0.4.27.md) got duplicate paths because Path.with_suffix("") stripped version-like suffixes. Fix: strip only known doc extensions (.md, .adoc, .asciidoc) via new strip_doc_extension() utility. Also pass base_path to MarkdownStructureParser in CLI and MCP server. #268: Duplicate-path warnings from index building were only logged to stderr, not included in validate_structure JSON output. Fix: store build warnings in StructureIndex._build_warnings and include them as "duplicate_path" type in validation results. Bumps version to 0.4.30. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "dacli"
-version = "0.4.29"
+version = "0.4.30"
 description = "Documentation Access CLI - Navigate and query large documentation projects"
 readme = "README.md"
 license = { text = "MIT" }
diff --git a/src/dacli/__init__.py b/src/dacli/__init__.py
@@ -5,4 +5,4 @@
 """
 
 
-__version__ = "0.4.29"
+__version__ = "0.4.30"
diff --git a/src/dacli/asciidoc_parser.py b/src/dacli/asciidoc_parser.py
@@ -23,6 +23,7 @@
     collect_all_sections,
     find_section_by_path,
     slugify,
+    strip_doc_extension,
 )
 
 # Regex patterns from specification
@@ -189,6 +190,9 @@ def _get_file_prefix(self, file_path: Path) -> str:
         The file prefix is the relative path from base_path to file_path,
         without the file extension. This ensures unique paths across documents.
 
+        Issue #266: Only strips known extensions (.md, .adoc) to preserve dots
+        in filenames (e.g. version numbers like "report_v1.2.3.adoc").
+
         Args:
             file_path: Path to the document being parsed
 
@@ -198,10 +202,8 @@ def _get_file_prefix(self, file_path: Path) -> str:
         try:
             relative = file_path.relative_to(self.base_path)
         except ValueError:
-            # file_path is not relative to base_path, use just the stem
-            relative = Path(file_path.stem)
-        # Remove extension and convert to forward slashes
-        return str(relative.with_suffix("")).replace("\\", "/")
+            relative = Path(file_path.name)
+        return strip_doc_extension(relative)
 
     def get_section(
         self, doc: AsciidocDocument, path: str
diff --git a/src/dacli/cli.py b/src/dacli/cli.py
@@ -284,7 +284,7 @@ def __init__(
         self.index = StructureIndex()
         self.file_handler = FileSystemHandler()
         self.asciidoc_parser = AsciidocStructureParser(base_path=docs_root)
-        self.markdown_parser = MarkdownStructureParser()
+        self.markdown_parser = MarkdownStructureParser(base_path=docs_root)
 
         # Build index
         _build_index(
diff --git a/src/dacli/markdown_parser.py b/src/dacli/markdown_parser.py
@@ -23,6 +23,7 @@
     collect_all_sections,
     find_section_by_path,
     slugify,
+    strip_doc_extension,
 )
 
 logger = logging.getLogger(__name__)
@@ -118,6 +119,9 @@ def _get_file_prefix(self, file_path: Path) -> str:
         The file prefix is the relative path from base_path to file_path,
         without the file extension. This ensures unique paths across documents.
 
+        Issue #266: Only strips known extensions (.md, .adoc) to preserve dots
+        in filenames (e.g. version numbers like "report_v1.2.3.md").
+
         Args:
             file_path: Path to the document being parsed
 
@@ -128,13 +132,10 @@ def _get_file_prefix(self, file_path: Path) -> str:
             try:
                 relative = file_path.relative_to(self.base_path)
             except ValueError:
-                # file_path is not relative to base_path, use just the stem
-                relative = Path(file_path.stem)
+                relative = Path(file_path.name)
         else:
-            # No base_path provided, use just the stem
-            relative = Path(file_path.stem)
-        # Remove extension and convert to forward slashes
-        return str(relative.with_suffix("")).replace("\\", "/")
+            relative = Path(file_path.name)
+        return strip_doc_extension(relative)
 
     def parse_file(self, file_path: Path) -> MarkdownDocument:
         """Parse a single Markdown file.
diff --git a/src/dacli/mcp_app.py b/src/dacli/mcp_app.py
@@ -134,7 +134,7 @@ def create_mcp_server(
     index = StructureIndex()
     file_handler = FileSystemHandler()
     asciidoc_parser = AsciidocStructureParser(base_path=docs_root)
-    markdown_parser = MarkdownStructureParser()
+    markdown_parser = MarkdownStructureParser(base_path=docs_root)
 
     # Build initial index
     _build_index(
diff --git a/src/dacli/parser_utils.py b/src/dacli/parser_utils.py
@@ -5,9 +5,34 @@
 """
 
 import re
+from pathlib import Path
 
 from dacli.models import Section
 
+# Extensions recognised as documentation formats; only these get stripped (Issue #266)
+KNOWN_DOC_EXTENSIONS = {".md", ".adoc", ".asciidoc"}
+
+
+def strip_doc_extension(file_path: Path) -> str:
+    """Return the path as a "/"-separated string without a known doc extension.
+
+    Path.with_suffix("") drops whatever follows the last dot, so a name such as
+    "report_v1.2.3" would lose ".3". Here the final suffix is removed only when
+    it matches KNOWN_DOC_EXTENSIONS (compared in lowercase).
+
+    Args:
+        file_path: Path whose trailing document extension should be dropped.
+
+    Returns:
+        The path with backslashes turned into "/", minus the known extension.
+    """
+    extension = file_path.suffix
+    normalized = str(file_path).replace("\\", "/")
+    if extension.lower() not in KNOWN_DOC_EXTENSIONS:
+        return normalized
+    # The string form ends with the suffix, so cut exactly that many characters.
+    return normalized[: len(normalized) - len(extension)]
+
 
 def slugify(text: str) -> str:
     """Convert text to URL-friendly slug.
diff --git a/src/dacli/services/validation_service.py b/src/dacli/services/validation_service.py
@@ -90,6 +90,20 @@ def validate_structure(index: StructureIndex, docs_root: Path) -> dict:
                 "message": pw.message,
             })
 
+    # Issue #268: Include duplicate-path warnings from index build
+    for build_warning in index._build_warnings:
+        if "Duplicate section path" in build_warning:
+            # Parse the warning string to extract the path
+            # Format: "Duplicate section path: 'path' (first at file:line, duplicate at file:line)"
+            import re
+            match = re.search(r"Duplicate section path: '([^']+)'", build_warning)
+            dup_path = match.group(1) if match else "unknown"
+            warnings.append({
+                "type": "duplicate_path",
+                "path": dup_path,
+                "message": build_warning,
+            })
+
     # Issue #219: Check for unresolved includes
     for doc in index._documents:
         # Only AsciiDoc documents have includes (check for attribute)
diff --git a/src/dacli/structure_index.py b/src/dacli/structure_index.py
@@ -69,6 +69,7 @@ def __init__(self) -> None:
         self._documents: list[Document] = []
         self._top_level_sections: list[Section] = []
         self._circular_include_errors: list[dict] = []
+        self._build_warnings: list[str] = []  # Issue #268: Store duplicate path warnings
         self._index_ready: bool = False
 
     def build_from_documents(self, documents: list[Document]) -> list[str]:
@@ -99,6 +100,7 @@ def build_from_documents(self, documents: list[Document]) -> list[str]:
             for element in doc.elements:
                 self._index_element(element)
 
+        self._build_warnings = warnings  # Issue #268: Store for validation
         self._index_ready = True
         logger.info(
             f"Index built: {len(self._path_to_section)} sections, "
@@ -494,6 +496,7 @@ def clear(self) -> None:
         self._documents.clear()
         self._top_level_sections.clear()
         self._circular_include_errors.clear()
+        self._build_warnings.clear()
         self._index_ready = False
 
     def stats(self) -> dict:
diff --git a/tests/test_dotted_filenames_266.py b/tests/test_dotted_filenames_266.py
@@ -0,0 +1,89 @@
+"""Tests for dotted filenames producing unique paths (Issue #266)."""
+
+from dacli.markdown_parser import MarkdownStructureParser
+from dacli.structure_index import StructureIndex
+
+
+class TestDottedFilenames:
+    """Dots inside a filename (such as a version number) must not cause path collisions."""
+
+    def test_version_numbered_files_have_unique_paths(self, tmp_path):
+        """Files differing only in their version number get distinct section paths."""
+        older = tmp_path / "RESULTS_v0.4.27.md"
+        newer = tmp_path / "RESULTS_v0.4.28.md"
+        older.write_text("# Results v0.4.27\n\nContent.\n")
+        newer.write_text("# Results v0.4.28\n\nContent.\n")
+
+        parser = MarkdownStructureParser(base_path=tmp_path)
+        documents = [parser.parse_file(older), parser.parse_file(newer)]
+
+        index = StructureIndex()
+        warnings = index.build_from_documents(documents)
+
+        # Building the index must not produce any warnings for these two files.
+        assert len(warnings) == 0, f"Unexpected warnings: {warnings}"
+
+        # Every section path listed in the structure must be distinct.
+        paths = [section["path"] for section in index.get_structure()["sections"]]
+        assert len(paths) == len(set(paths)), f"Duplicate paths found: {paths}"
+
+    def test_cli_context_passes_base_path_to_markdown_parser(self, tmp_path):
+        """The Markdown parser held by CliContext has base_path equal to docs_root."""
+        from dacli.cli import CliContext
+
+        # Give the context one dotted-name document to pick up.
+        (tmp_path / "test_v1.2.3.md").write_text("# Test v1.2.3\n")
+
+        context = CliContext(
+            docs_root=tmp_path,
+            output_format="json",
+            pretty=False,
+        )
+        markdown_parser = context.markdown_parser
+        assert markdown_parser.base_path == tmp_path
+
+    def test_file_prefix_without_base_path_strips_only_known_extensions(self, tmp_path):
+        """A parser built without base_path still keeps the version dots in the path."""
+        source = tmp_path / "report_v2.1.5.md"
+        source.write_text("# Report\n")
+
+        # Regression for #266: the old Path(stem).with_suffix("") call dropped ".5".
+        document = MarkdownStructureParser().parse_file(source)  # no base_path given
+
+        # Only ".md" is removed; "2.1.5" survives intact.
+        first_section = document.sections[0]
+        assert first_section.path == "report_v2.1.5"
+
+    def test_get_file_prefix_preserves_version_dots(self, tmp_path):
+        """The computed prefix keeps "3.2.1" and loses only the ".md" extension."""
+        target = tmp_path / "data_v3.2.1.md"
+        parser = MarkdownStructureParser(base_path=tmp_path)
+        assert parser._get_file_prefix(target) == "data_v3.2.1"
+
+    def test_subdirectory_file_with_dots(self, tmp_path):
+        """A dotted filename below a subdirectory yields "<dir>/<name>" as its path."""
+        reports_dir = tmp_path / "reports"
+        reports_dir.mkdir()
+        sprint_file = reports_dir / "sprint_2.0.1.md"
+        sprint_file.write_text("# Sprint 2.0.1\n\nNotes.\n")
+
+        document = MarkdownStructureParser(base_path=tmp_path).parse_file(sprint_file)
+
+        # The directory component is kept and only ".md" is removed.
+        assert document.sections[0].path == "reports/sprint_2.0.1"
+
+
+class TestDottedFilenamesAsciiDoc:
+    """The same dotted-filename guarantee, checked for the AsciiDoc parser."""
+
+    def test_asciidoc_file_with_version_dots(self, tmp_path):
+        """Parsing "release_v1.2.3.adoc" produces the path "release_v1.2.3"."""
+        from dacli.asciidoc_parser import AsciidocStructureParser
+
+        release_notes = tmp_path / "release_v1.2.3.adoc"
+        release_notes.write_text("= Release v1.2.3\n\nContent.\n")
+
+        document = AsciidocStructureParser(base_path=tmp_path).parse_file(release_notes)
+
+        # Only the ".adoc" extension is stripped from the section path.
+        assert document.sections[0].path == "release_v1.2.3"
diff --git a/tests/test_validate_duplicate_warnings_268.py b/tests/test_validate_duplicate_warnings_268.py
@@ -0,0 +1,111 @@
+"""Tests for duplicate-path warnings in JSON validate output (Issue #268)."""
+
+from dacli.models import Document, Section, SourceLocation
+from dacli.services.validation_service import validate_structure
+from dacli.structure_index import StructureIndex
+
+
+class TestDuplicatePathWarningsInValidation:
+    """validate_structure must report duplicate section paths found during indexing."""
+
+    def test_duplicate_paths_appear_in_validation_warnings(self, tmp_path):
+        """Two documents sharing a section path yield a duplicate_path warning."""
+        file_a = tmp_path / "a.md"
+        intro_a = Section(
+            title="Introduction",
+            level=1,
+            path="intro",
+            source_location=SourceLocation(file=file_a, line=1),
+        )
+        first_doc = Document(
+            file_path=file_a,
+            title="A",
+            sections=[intro_a],
+            elements=[],
+        )
+        file_b = tmp_path / "b.md"
+        intro_b = Section(
+            title="Introduction",
+            level=1,
+            path="intro",  # deliberately collides with the first document
+            source_location=SourceLocation(file=file_b, line=1),
+        )
+        second_doc = Document(
+            file_path=file_b,
+            title="B",
+            sections=[intro_b],
+            elements=[],
+        )
+
+        index = StructureIndex()
+        build_warnings = index.build_from_documents([first_doc, second_doc])
+        assert len(build_warnings) > 0, "Should have duplicate path warnings"
+
+        result = validate_structure(index, tmp_path)
+        reported_types = [item["type"] for item in result["warnings"]]
+        assert "duplicate_path" in reported_types, (
+            f"duplicate_path not in warnings: {result['warnings']}"
+        )
+
+    def test_duplicate_path_warning_includes_details(self, tmp_path):
+        """The duplicate_path entry names the colliding path and carries a message."""
+        file_a = tmp_path / "a.md"
+        setup_a = Section(
+            title="Setup",
+            level=1,
+            path="setup",
+            source_location=SourceLocation(file=file_a, line=5),
+        )
+        first_doc = Document(
+            file_path=file_a,
+            title="A",
+            sections=[setup_a],
+            elements=[],
+        )
+        file_b = tmp_path / "b.md"
+        setup_b = Section(
+            title="Setup",
+            level=1,
+            path="setup",
+            source_location=SourceLocation(file=file_b, line=3),
+        )
+        second_doc = Document(
+            file_path=file_b,
+            title="B",
+            sections=[setup_b],
+            elements=[],
+        )
+
+        index = StructureIndex()
+        index.build_from_documents([first_doc, second_doc])
+
+        result = validate_structure(index, tmp_path)
+        duplicates = [item for item in result["warnings"] if item["type"] == "duplicate_path"]
+        assert len(duplicates) == 1
+
+        reported = duplicates[0]
+        assert reported["path"] == "setup"
+        assert "message" in reported
+
+    def test_no_duplicate_warnings_when_paths_unique(self, tmp_path):
+        """A single document with one section produces no duplicate_path entries."""
+        only_file = tmp_path / "a.md"
+        intro = Section(
+            title="Intro",
+            level=1,
+            path="intro",
+            source_location=SourceLocation(file=only_file, line=1),
+        )
+        document = Document(
+            file_path=only_file,
+            title="A",
+            sections=[intro],
+            elements=[],
+        )
+
+        index = StructureIndex()
+        index.build_from_documents([document])
+
+        result = validate_structure(index, tmp_path)
+        duplicates = [item for item in result["warnings"] if item["type"] == "duplicate_path"]
+        assert duplicates == []
diff --git a/uv.lock b/uv.lock

Original file line number	Diff line number	Diff line change
`@@ -5,4 +5,4 @@`
`5`	`5`	`"""`
`6`	`6`
`7`	`7`
`8`		`-__version__ = "0.4.29"`
	`8`	`+__version__ = "0.4.30"`