diff --git a/pyproject.toml b/pyproject.toml index bfb9ce8..f8e7956 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "dacli" -version = "0.4.29" +version = "0.4.30" description = "Documentation Access CLI - Navigate and query large documentation projects" readme = "README.md" license = { text = "MIT" } diff --git a/src/dacli/__init__.py b/src/dacli/__init__.py index 0fc6e60..79ab174 100644 --- a/src/dacli/__init__.py +++ b/src/dacli/__init__.py @@ -5,4 +5,4 @@ """ -__version__ = "0.4.29" +__version__ = "0.4.30" diff --git a/src/dacli/asciidoc_parser.py b/src/dacli/asciidoc_parser.py index 868f707..97e1fe9 100644 --- a/src/dacli/asciidoc_parser.py +++ b/src/dacli/asciidoc_parser.py @@ -23,6 +23,7 @@ collect_all_sections, find_section_by_path, slugify, + strip_doc_extension, ) # Regex patterns from specification @@ -189,6 +190,9 @@ def _get_file_prefix(self, file_path: Path) -> str: The file prefix is the relative path from base_path to file_path, without the file extension. This ensures unique paths across documents. + Issue #266: Only strips known extensions (.md, .adoc) to preserve dots + in filenames (e.g. version numbers like "report_v1.2.3.adoc"). 
+ Args: file_path: Path to the document being parsed @@ -198,10 +202,8 @@ def _get_file_prefix(self, file_path: Path) -> str: try: relative = file_path.relative_to(self.base_path) except ValueError: - # file_path is not relative to base_path, use just the stem - relative = Path(file_path.stem) - # Remove extension and convert to forward slashes - return str(relative.with_suffix("")).replace("\\", "/") + relative = Path(file_path.name) + return strip_doc_extension(relative) def get_section( self, doc: AsciidocDocument, path: str diff --git a/src/dacli/cli.py b/src/dacli/cli.py index cf32932..9736d26 100644 --- a/src/dacli/cli.py +++ b/src/dacli/cli.py @@ -284,7 +284,7 @@ def __init__( self.index = StructureIndex() self.file_handler = FileSystemHandler() self.asciidoc_parser = AsciidocStructureParser(base_path=docs_root) - self.markdown_parser = MarkdownStructureParser() + self.markdown_parser = MarkdownStructureParser(base_path=docs_root) # Build index _build_index( diff --git a/src/dacli/markdown_parser.py b/src/dacli/markdown_parser.py index b02a110..b3aea70 100644 --- a/src/dacli/markdown_parser.py +++ b/src/dacli/markdown_parser.py @@ -23,6 +23,7 @@ collect_all_sections, find_section_by_path, slugify, + strip_doc_extension, ) logger = logging.getLogger(__name__) @@ -118,6 +119,9 @@ def _get_file_prefix(self, file_path: Path) -> str: The file prefix is the relative path from base_path to file_path, without the file extension. This ensures unique paths across documents. + Issue #266: Only strips known extensions (.md, .adoc) to preserve dots + in filenames (e.g. version numbers like "report_v1.2.3.md"). 
# Known document extensions to strip from file paths (Issue #266).
# Entries are lowercase and include the leading dot; lookups lowercase the
# candidate suffix first, so matching is case-insensitive.
KNOWN_DOC_EXTENSIONS = {".md", ".adoc", ".asciidoc"}


def strip_doc_extension(file_path: "Path | str") -> str:
    """Remove only a known document extension from a file path.

    Unlike ``Path.with_suffix("")``, which drops whatever follows the last
    dot, this removes the final suffix only when it is one of
    ``KNOWN_DOC_EXTENSIONS`` (.md, .adoc, .asciidoc; compared
    case-insensitively). Dots that belong to the filename itself are kept,
    e.g. ``report_v1.2.3.md`` becomes ``report_v1.2.3`` and ``data_v3.2.1``
    is returned unchanged.

    Args:
        file_path: Path to strip the extension from. A plain string is
            accepted as well and is converted to a ``Path`` first.

    Returns:
        The path as a string with backslashes replaced by forward slashes
        and the known extension (if any) removed.
    """
    # Generalization: callers holding a raw string need not wrap it themselves.
    # Path objects are passed through untouched so existing behavior is kept.
    if isinstance(file_path, str):
        file_path = Path(file_path)

    normalized = str(file_path).replace("\\", "/")
    extension = file_path.suffix

    # Only the last suffix is inspected; anything else stays in the name.
    if extension.lower() not in KNOWN_DOC_EXTENSIONS:
        return normalized

    # ``extension`` is non-empty here (every known extension has length > 0),
    # so the negative slice bound can never be -0.
    return normalized[: -len(extension)]
"""Tests for dotted filenames producing unique paths (Issue #266)."""

from dacli.markdown_parser import MarkdownStructureParser
from dacli.structure_index import StructureIndex


class TestDottedFilenames:
    """Dots inside a filename (e.g. version numbers) must survive path generation."""

    def test_version_numbered_files_have_unique_paths(self, tmp_path):
        """Two files that differ only in their version suffix must not collide."""
        sources = {
            "RESULTS_v0.4.27.md": "# Results v0.4.27\n\nContent.\n",
            "RESULTS_v0.4.28.md": "# Results v0.4.28\n\nContent.\n",
        }
        doc_files = []
        for filename, content in sources.items():
            doc_file = tmp_path / filename
            doc_file.write_text(content)
            doc_files.append(doc_file)

        parser = MarkdownStructureParser(base_path=tmp_path)
        documents = [parser.parse_file(doc_file) for doc_file in doc_files]

        index = StructureIndex()
        warnings = index.build_from_documents(documents)

        assert len(warnings) == 0, f"Unexpected warnings: {warnings}"

        structure = index.get_structure()
        paths = [s["path"] for s in structure["sections"]]
        assert len(paths) == len(set(paths)), f"Duplicate paths found: {paths}"

    def test_cli_context_passes_base_path_to_markdown_parser(self, tmp_path):
        """CliContext must hand docs_root to MarkdownStructureParser as base_path."""
        from dacli.cli import CliContext

        (tmp_path / "test_v1.2.3.md").write_text("# Test v1.2.3\n")

        context = CliContext(docs_root=tmp_path, output_format="json", pretty=False)

        # The markdown parser must have been constructed with the docs root.
        assert context.markdown_parser.base_path == tmp_path

    def test_file_prefix_without_base_path_strips_only_known_extensions(self, tmp_path):
        """With no base_path, only the .md extension is removed, never version dots."""
        report = tmp_path / "report_v2.1.5.md"
        report.write_text("# Report\n")

        # Issue #266 scenario: the parser is deliberately built without base_path.
        parsed = MarkdownStructureParser().parse_file(report)

        first_section = parsed.sections[0]
        assert first_section.path == "report_v2.1.5"

    def test_get_file_prefix_preserves_version_dots(self, tmp_path):
        """_get_file_prefix must keep version-like dotted parts of the name."""
        target = tmp_path / "data_v3.2.1.md"
        parser = MarkdownStructureParser(base_path=tmp_path)

        assert parser._get_file_prefix(target) == "data_v3.2.1"

    def test_subdirectory_file_with_dots(self, tmp_path):
        """Dotted filenames inside a subdirectory keep the directory and the dots."""
        reports_dir = tmp_path / "reports"
        reports_dir.mkdir()
        sprint_file = reports_dir / "sprint_2.0.1.md"
        sprint_file.write_text("# Sprint 2.0.1\n\nNotes.\n")

        parsed = MarkdownStructureParser(base_path=tmp_path).parse_file(sprint_file)

        assert parsed.sections[0].path == "reports/sprint_2.0.1"


class TestDottedFilenamesAsciiDoc:
    """The same guarantee applies to AsciiDoc files with dots in their names."""

    def test_asciidoc_file_with_version_dots(self, tmp_path):
        """An .adoc file with a version in its name keeps the full version in its path."""
        from dacli.asciidoc_parser import AsciidocStructureParser

        release_file = tmp_path / "release_v1.2.3.adoc"
        release_file.write_text("= Release v1.2.3\n\nContent.\n")

        parsed = AsciidocStructureParser(base_path=tmp_path).parse_file(release_file)

        assert parsed.sections[0].path == "release_v1.2.3"
"""Tests for duplicate-path warnings in JSON validate output (Issue #268)."""

from dacli.models import Document, Section, SourceLocation
from dacli.services.validation_service import validate_structure
from dacli.structure_index import StructureIndex


def _single_section_document(root, filename, doc_title, section_title, section_path, line):
    """Build a Document with exactly one level-1 section located in ``root / filename``."""
    source_file = root / filename
    section = Section(
        title=section_title,
        level=1,
        path=section_path,
        source_location=SourceLocation(file=source_file, line=line),
    )
    return Document(
        file_path=source_file,
        title=doc_title,
        sections=[section],
        elements=[],
    )


def _duplicate_path_warnings(result):
    """Return only the ``duplicate_path`` entries of a validate_structure result."""
    return [w for w in result["warnings"] if w["type"] == "duplicate_path"]


class TestDuplicatePathWarningsInValidation:
    """validate_structure must surface duplicate-path warnings from the index build."""

    def test_duplicate_paths_appear_in_validation_warnings(self, tmp_path):
        """Two documents sharing a section path produce a duplicate_path warning."""
        first = _single_section_document(tmp_path, "a.md", "A", "Introduction", "intro", 1)
        # Deliberately reuses the section path of the first document.
        second = _single_section_document(tmp_path, "b.md", "B", "Introduction", "intro", 1)

        index = StructureIndex()
        build_warnings = index.build_from_documents([first, second])
        assert len(build_warnings) > 0, "Should have duplicate path warnings"

        result = validate_structure(index, tmp_path)
        warning_types = [w["type"] for w in result["warnings"]]
        assert "duplicate_path" in warning_types, (
            f"duplicate_path not in warnings: {result['warnings']}"
        )

    def test_duplicate_path_warning_includes_details(self, tmp_path):
        """The duplicate_path warning carries the colliding path and a message."""
        first = _single_section_document(tmp_path, "a.md", "A", "Setup", "setup", 5)
        second = _single_section_document(tmp_path, "b.md", "B", "Setup", "setup", 3)

        index = StructureIndex()
        index.build_from_documents([first, second])

        result = validate_structure(index, tmp_path)
        dup_warnings = _duplicate_path_warnings(result)
        assert len(dup_warnings) == 1

        warning = dup_warnings[0]
        assert warning["path"] == "setup"
        assert "message" in warning

    def test_no_duplicate_warnings_when_paths_unique(self, tmp_path):
        """A single document with a unique path yields no duplicate_path warning."""
        only_doc = _single_section_document(tmp_path, "a.md", "A", "Intro", "intro", 1)

        index = StructureIndex()
        index.build_from_documents([only_doc])

        result = validate_structure(index, tmp_path)
        dup_warnings = _duplicate_path_warnings(result)
        assert len(dup_warnings) == 0