Skip to content

Commit c1d2397

Browse files
committed
fix(docling): preserve metadata and harden Astra/read-file paths
1 parent d6f670a commit c1d2397

File tree

7 files changed

+143
-16
lines changed

7 files changed

+143
-16
lines changed

src/backend/tests/unit/components/files_and_knowledge/test_file_component.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,30 @@ def test_process_docling_subprocess_success(self, mock_subprocess):
143143
assert result.data["doc"] == mock_result["doc"]
144144
assert result.data["file_path"] == "test.pdf"
145145

146+
@patch("subprocess.run")
147+
def test_process_docling_subprocess_allows_ampersand_in_path(self, mock_subprocess):
148+
"""Test that valid file paths containing '&' are not rejected before subprocess execution."""
149+
component = FileComponent()
150+
component.markdown = False
151+
152+
mock_result = {
153+
"ok": True,
154+
"mode": "structured",
155+
"doc": [{"page_no": 1, "label": "paragraph", "text": "Content", "level": 0}],
156+
"meta": {"file_path": "/tmp/input.pdf"},
157+
}
158+
mock_subprocess.return_value = MagicMock(
159+
stdout=json.dumps(mock_result).encode("utf-8"),
160+
stderr=b"",
161+
)
162+
163+
input_path = "docs/R&D/report.pdf"
164+
result = component._process_docling_in_subprocess(input_path)
165+
166+
assert "error" not in result.data
167+
assert result.data["file_path"] == input_path
168+
mock_subprocess.assert_called_once()
169+
146170
def test_dynamic_outputs_have_tool_mode_enabled(self):
147171
"""Test that all dynamically created outputs have tool_mode=True."""
148172
component = FileComponent()

src/lfx/src/lfx/base/data/docling_utils.py

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -34,10 +34,31 @@ def extract_docling_documents(
3434
Returns:
3535
A tuple of (documents, warning_message) where warning_message is None if no warning
3636
37+
Raises:
38+
TypeError: If the data cannot be extracted or is invalid
39+
"""
40+
documents, _, warning_message = extract_docling_documents_with_metadata(data_inputs, doc_key)
41+
return documents, warning_message
42+
43+
44+
def extract_docling_documents_with_metadata(
45+
data_inputs: Data | list[Data] | DataFrame, doc_key: str
46+
) -> tuple[list[DoclingDocument], list[dict], str | None]:
47+
"""Extract DoclingDocument objects and aligned metadata from data inputs.
48+
49+
Args:
50+
data_inputs: The data inputs containing DoclingDocument objects
51+
doc_key: The key/column name to look for DoclingDocument objects
52+
53+
Returns:
54+
A tuple of (documents, metadata, warning_message) where warning_message is None if no warning.
55+
Metadata entries preserve all source fields except the DoclingDocument field itself.
56+
3757
Raises:
3858
TypeError: If the data cannot be extracted or is invalid
3959
"""
4060
documents: list[DoclingDocument] = []
61+
metadata: list[dict] = []
4162
warning_message: str | None = None
4263

4364
if isinstance(data_inputs, DataFrame):
@@ -46,6 +67,7 @@ def extract_docling_documents(
4667
raise TypeError(msg)
4768

4869
# Primary: Check for exact column name match
70+
source_column = doc_key
4971
if doc_key in data_inputs.columns:
5072
try:
5173
documents = data_inputs[doc_key].tolist()
@@ -73,6 +95,7 @@ def extract_docling_documents(
7395
logger.warning(warning_message)
7496
try:
7597
documents = data_inputs[found_column].tolist()
98+
source_column = found_column
7699
except Exception as e:
77100
msg = f"Error extracting DoclingDocument from DataFrame column '{found_column}': {e}"
78101
raise TypeError(msg) from e
@@ -88,36 +111,45 @@ def extract_docling_documents(
88111
f"3. If using VLM pipeline, try using the standard pipeline"
89112
)
90113
raise TypeError(msg)
114+
115+
for row in data_inputs.to_dict(orient="records"):
116+
row_doc = row.get(source_column)
117+
if isinstance(row_doc, DoclingDocument):
118+
metadata.append({k: v for k, v in row.items() if k != source_column})
91119
else:
92120
if not data_inputs:
93121
msg = "No data inputs provided"
94122
raise TypeError(msg)
95123

96124
if isinstance(data_inputs, Data):
97-
if doc_key not in data_inputs.data:
125+
if doc_key not in data_inputs.data or not isinstance(data_inputs.data[doc_key], DoclingDocument):
98126
msg = (
99127
f"'{doc_key}' field not available in the input Data. "
100128
"Check that your input is a DoclingDocument. "
101129
"You can use the Docling component to convert your input to a DoclingDocument."
102130
)
103131
raise TypeError(msg)
104132
documents = [data_inputs.data[doc_key]]
133+
metadata = [{k: v for k, v in data_inputs.data.items() if k != doc_key}]
105134
else:
106135
try:
107-
documents = [
108-
input_.data[doc_key]
109-
for input_ in data_inputs
110-
if isinstance(input_, Data)
111-
and doc_key in input_.data
112-
and isinstance(input_.data[doc_key], DoclingDocument)
113-
]
136+
documents = []
137+
metadata = []
138+
for input_ in data_inputs:
139+
if (
140+
isinstance(input_, Data)
141+
and doc_key in input_.data
142+
and isinstance(input_.data[doc_key], DoclingDocument)
143+
):
144+
documents.append(input_.data[doc_key])
145+
metadata.append({k: v for k, v in input_.data.items() if k != doc_key})
114146
if not documents:
115147
msg = f"No valid Data inputs found in {type(data_inputs)}"
116148
raise TypeError(msg)
117149
except AttributeError as e:
118150
msg = f"Invalid input type in collection: {e}"
119151
raise TypeError(msg) from e
120-
return documents, warning_message
152+
return documents, metadata, warning_message
121153

122154

123155
def _unwrap_secrets(obj):

src/lfx/src/lfx/components/datastax/astradb_vectorstore.py

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import math
2+
13
from astrapy import DataAPIClient
24
from langchain_core.documents import Document
35

@@ -409,7 +411,11 @@ def _add_documents_to_vector_store(self, vector_store) -> None:
409411
raise TypeError(msg)
410412

411413
documents = [
412-
Document(page_content=doc.page_content, metadata=serialize(doc.metadata, to_str=True)) for doc in documents
414+
Document(
415+
page_content=doc.page_content,
416+
metadata=serialize(self._sanitize_metadata(doc.metadata), to_str=True),
417+
)
418+
for doc in documents
413419
]
414420

415421
if documents and self.deletion_field:
@@ -434,6 +440,18 @@ def _add_documents_to_vector_store(self, vector_store) -> None:
434440
else:
435441
self.log("No documents to add to the Vector Store.")
436442

443+
@classmethod
444+
def _sanitize_metadata(cls, value):
445+
if isinstance(value, float) and not math.isfinite(value):
446+
return None
447+
if isinstance(value, dict):
448+
return {k: cls._sanitize_metadata(v) for k, v in value.items()}
449+
if isinstance(value, list):
450+
return [cls._sanitize_metadata(v) for v in value]
451+
if isinstance(value, tuple):
452+
return tuple(cls._sanitize_metadata(v) for v in value)
453+
return value
454+
437455
def _map_search_type(self) -> str:
438456
search_type_mapping = {
439457
"Similarity with score threshold": "similarity_score_threshold",

src/lfx/src/lfx/components/docling/export_docling_document.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
from docling_core.types.doc import ImageRefMode
44

5-
from lfx.base.data.docling_utils import extract_docling_documents
5+
from lfx.base.data.docling_utils import extract_docling_documents_with_metadata
66
from lfx.custom import Component
77
from lfx.io import DropdownInput, HandleInput, MessageTextInput, Output, StrInput
88
from lfx.schema import Data, DataFrame
@@ -86,14 +86,14 @@ def update_build_config(self, build_config: dict, field_value: Any, field_name:
8686
return build_config
8787

8888
def export_document(self) -> list[Data]:
89-
documents, warning = extract_docling_documents(self.data_inputs, self.doc_key)
89+
documents, metadata_list, warning = extract_docling_documents_with_metadata(self.data_inputs, self.doc_key)
9090
if warning:
9191
self.status = warning
9292

9393
results: list[Data] = []
9494
try:
9595
image_mode = ImageRefMode(self.image_mode)
96-
for doc in documents:
96+
for index, doc in enumerate(documents):
9797
content = ""
9898
if self.export_format == "Markdown":
9999
content = doc.export_to_markdown(
@@ -108,7 +108,8 @@ def export_document(self) -> list[Data]:
108108
elif self.export_format == "DocTags":
109109
content = doc.export_to_doctags()
110110

111-
results.append(Data(text=content))
111+
metadata = metadata_list[index] if index < len(metadata_list) else {}
112+
results.append(Data(text=content, data=metadata))
112113
except Exception as e:
113114
msg = f"Error splitting text: {e}"
114115
raise TypeError(msg) from e

src/lfx/src/lfx/components/files_and_knowledge/file.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,8 +1000,8 @@ def main():
10001000
"""
10011001
)
10021002

1003-
# Validate file_path to avoid command injection or unsafe input
1004-
if not isinstance(args["file_path"], str) or any(c in args["file_path"] for c in [";", "|", "&", "$", "`"]):
1003+
# Input goes through stdin (not shell); reject only clearly invalid path payloads.
1004+
if not isinstance(args["file_path"], str) or not args["file_path"] or "\x00" in args["file_path"]:
10051005
return Data(data={"error": "Unsafe file path detected.", "file_path": args["file_path"]})
10061006

10071007
proc = subprocess.run( # noqa: S603
[file name missing from extraction — likely the new Astra DB vector-store unit-test file; confirm against the original commit] Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
from lfx.components.datastax.astradb_vectorstore import AstraDBVectorStoreComponent
2+
3+
4+
class TestAstraDBVectorStoreComponent:
5+
def test_sanitize_metadata_replaces_non_finite_floats(self):
6+
metadata = {
7+
"score": float("nan"),
8+
"nested": {"upper": float("inf"), "lower": float("-inf")},
9+
"items": [1.0, float("nan"), {"v": float("inf")}],
10+
}
11+
12+
sanitized = AstraDBVectorStoreComponent._sanitize_metadata(metadata)
13+
14+
assert sanitized["score"] is None
15+
assert sanitized["nested"]["upper"] is None
16+
assert sanitized["nested"]["lower"] is None
17+
assert sanitized["items"][1] is None
18+
assert sanitized["items"][2]["v"] is None
[file name missing from extraction — likely the new Docling export-component unit-test file; confirm against the original commit] Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import pytest
2+
3+
pytest.importorskip("docling_core")
4+
5+
from lfx.components.docling.export_docling_document import ExportDoclingDocumentComponent
6+
7+
8+
class _DummyDoc:
9+
def export_to_markdown(self, **_kwargs):
10+
return "exported markdown"
11+
12+
13+
class TestExportDoclingDocumentComponent:
14+
def test_export_document_preserves_input_metadata(self, monkeypatch):
15+
component = ExportDoclingDocumentComponent()
16+
component.export_format = "Markdown"
17+
component.image_mode = "placeholder"
18+
component.md_image_placeholder = "<!-- image -->"
19+
component.md_page_break_placeholder = ""
20+
component.doc_key = "doc"
21+
22+
metadata = {"file_path": "docs/report.pdf", "source": "docling-remote"}
23+
monkeypatch.setattr(
24+
"lfx.components.docling.export_docling_document.extract_docling_documents_with_metadata",
25+
lambda *_args, **_kwargs: ([_DummyDoc()], [metadata], None),
26+
)
27+
28+
result = component.export_document()
29+
30+
assert len(result) == 1
31+
assert result[0].text == "exported markdown"
32+
assert result[0].data["file_path"] == "docs/report.pdf"
33+
assert result[0].data["source"] == "docling-remote"
34+
assert "doc" not in result[0].data

Comments (0)