Skip to content

Commit d480dab

Browse files
committed
feat(suggestions): return suggestions from metadata extraction and persist
1 parent 33bf36c commit d480dab

File tree

5 files changed

+204
-112
lines changed

5 files changed

+204
-112
lines changed

app/activities/extract_metadata.py

Lines changed: 48 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,76 @@
1-
"""Simple LLM-based metadata extraction activity."""
2-
3-
import os
1+
"""LLM-based metadata suggestions activity."""
42

53
from pydantic import BaseModel, Field
64
from pydantic_ai import Agent
75
from pydantic_ai.models.openai import OpenAIChatModel
86
from pydantic_ai.providers.litellm import LiteLLMProvider
7+
from pydantic_ai.providers.ollama import OllamaProvider
98
from temporalio import activity
109

10+
from app.config import get_settings
11+
from app.workflows.suggestions import MetadataResult
1112

12-
class DocumentMetadata(BaseModel):
13-
"""Metadata extracted from a document."""
14-
15-
title: str = Field(description="Main title of the document")
16-
abstract: str | None = Field(default=None, description="Document abstract")
17-
authors: list[str] = Field(
18-
default_factory=list, description="List of document authors"
19-
)
13+
def _parse_llm(llm: str) -> tuple[str, str]:
14+
provider, sep, model_name = llm.partition("/")
15+
if not sep:
16+
raise ValueError("Invalid LLM; expected '<provider>/<model>'")
17+
provider = provider.strip().lower()
18+
model_name = model_name.strip()
19+
if provider not in {"litellm", "ollama"}:
20+
raise ValueError("Invalid LLM; provider must be 'litellm' or 'ollama'")
21+
if not model_name:
22+
raise ValueError("Invalid LLM; model name is missing")
23+
return provider, model_name
2024

2125

2226
class ExtractMetadataRequest(BaseModel):
    """Request to generate metadata suggestions from document text."""

    # Raw text extracted from the PDF; passed verbatim to the LLM agent.
    text: str = Field(description="Document text to analyze")
2730

2831

2932
# System prompt for the suggestion agent; the output schema it describes must
# stay in sync with the MetadataResult / suggestion models.
INSTRUCTIONS = """\
You generate metadata suggestions from document text.

Return a list of typed suggestions for the following fields:
- title (string)
- description (string; the abstract/summary)
- creators (list of objects with: name, affiliation (optional), orcid (optional))

Rules:
- Only include information that is clearly stated in the text.
- If a field is not present or cannot be determined, omit that suggestion entirely.
- For creators.name, use the "Family, Given" format.
"""
3545

3646

37-
def _create_model() -> OpenAIChatModel:
    """Create an OpenAI-compatible chat model from settings."""
    settings = get_settings()
    provider_name, model_name = _parse_llm(settings.llm)

    # _parse_llm only admits "ollama" or "litellm", so a simple two-way
    # selection is exhaustive here.
    provider = (
        OllamaProvider(
            base_url=settings.ollama_base_url,
            api_key=settings.ollama_api_key,
        )
        if provider_name == "ollama"
        else LiteLLMProvider(
            api_base=settings.litellm_api_base,
            api_key=settings.litellm_api_key,
        )
    )
    return OpenAIChatModel(model_name=model_name, provider=provider)
4664

4765

4866
@activity.defn
49-
async def metadata_extraction(request: ExtractMetadataRequest) -> DocumentMetadata:
50-
"""Extract metadata using LLM."""
51-
model = _create_model(request.model)
67+
async def metadata_extraction(request: ExtractMetadataRequest) -> MetadataResult:
68+
"""Generate typed metadata suggestions using an LLM."""
69+
model = _create_model()
5270
agent = Agent(
5371
model=model,
5472
instructions=INSTRUCTIONS,
55-
output_type=DocumentMetadata,
73+
output_type=MetadataResult,
5674
)
5775

5876
result = await agent.run(request.text)

app/routers/workflows.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,18 @@ async def create_workflow(
100100
client = _get_temporal_client(request)
101101
await client.start_workflow(
102102
ExtractMetadata.run,
103-
args=[{"url": body.url, "extractor": body.extractor, "pages": body.pages}],
103+
args=[
104+
{
105+
"workflow_id": workflow_id,
106+
"tenant_id": auth.tenant_id,
107+
"url": body.url,
108+
"extractor": body.extractor,
109+
"pages": body.pages,
110+
}
111+
],
104112
id=f"extract-metadata-{workflow_id}",
105113
task_queue="extract-pdf-metadata-task-queue",
106114
)
107-
workflow.status = WorkflowStatus.SUCCESS
108-
session.commit()
109115
except Exception as e:
110116
print("Error(start_temporal_workflow)", e)
111117
try:
@@ -117,7 +123,7 @@ async def create_workflow(
117123
status_code=500, detail="Could not start extraction workflow"
118124
)
119125

120-
return {"public_id": workflow_id, "status": "PROCESSING"}
126+
return workflow.to_dict()
121127

122128

123129
@router.get(

app/workflows/extract_metadata_workflow.py

Lines changed: 54 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -8,93 +8,69 @@
88

99
from app.activities.extract_metadata import ExtractMetadataRequest, metadata_extraction
1010
from app.activities.extract_pdf_content import ExtractPdfContentRequest, text_extraction
11+
from app.activities.store_workflow_result import StoreWorkflowResultRequest, store_workflow_result
12+
from app.database.models import WorkflowStatus
13+
from app.workflows.suggestions import MetadataResult
1114

1215

13-
class ExtractMetadataWorkflowRequest(BaseModel):
    """Workflow request to extract PDF content and generate metadata suggestions."""

    # Identifiers used by the result-storage activity to persist the outcome.
    workflow_id: str = Field(description="Workflow public_id (DB primary identifier)")
    tenant_id: str = Field(description="Tenant id (ownership check)")
    # PDF extraction parameters forwarded to the text-extraction activity.
    url: str
    extractor: str = "pdfplumber"
    pages: list[int] | None = None
6924

7025

7126
@workflow.defn
class ExtractMetadata(PydanticAIWorkflow):
    """Workflow that extracts content from a PDF and uses an LLM to extract metadata."""

    async def _store_result(
        self,
        request: ExtractMetadataWorkflowRequest,
        status: WorkflowStatus,
        result: dict | None,
    ) -> None:
        """Persist the terminal workflow status (and optional result) to the DB."""
        await workflow.execute_activity(
            store_workflow_result,
            StoreWorkflowResultRequest(
                workflow_id=request.workflow_id,
                tenant_id=request.tenant_id,
                status=status,
                result=result,
            ),
            start_to_close_timeout=timedelta(minutes=1),
        )

    @workflow.run
    async def run(self, request_data: dict) -> MetadataResult:
        """Execute the extraction + suggestions workflow.

        Args:
            request_data: Serialized ``ExtractMetadataWorkflowRequest`` fields
                (workflow_id, tenant_id, url, extractor, pages).

        Returns:
            MetadataResult: Typed metadata suggestions generated by the LLM.

        Raises:
            Exception: Re-raises any activity failure after recording an
                ERROR status, so Temporal still observes the failure.
        """
        request = ExtractMetadataWorkflowRequest(**request_data)
        try:
            # Activity 1: Extract PDF text
            content = await workflow.execute_activity(
                text_extraction,
                ExtractPdfContentRequest(
                    url=request.url,
                    extractor=request.extractor,
                    pages=request.pages,
                ),
                start_to_close_timeout=timedelta(minutes=5),
            )

            # Activity 2: Generate metadata suggestions using LLM
            result = await workflow.execute_activity(
                metadata_extraction,
                ExtractMetadataRequest(text=content.text),
                start_to_close_timeout=timedelta(minutes=5),
            )
        except Exception:
            # Record the failure before surfacing it to Temporal.
            await self._store_result(request, WorkflowStatus.ERROR, None)
            raise

        await self._store_result(request, WorkflowStatus.SUCCESS, result.model_dump())
        return result

app/workflows/suggestions.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""Typed metadata suggestions returned by the workflow."""
2+
3+
# from __future__ import annotations
4+
5+
from typing import Annotated, Literal
6+
7+
from pydantic import BaseModel, Field, field_validator
8+
9+
10+
class Creator(BaseModel):
    """A structured creator/author."""

    # Canonical "Family, Given" form, enforced by normalize_name below.
    name: str
    affiliation: str | None = None
    orcid: str | None = None

    @field_validator("name")
    @classmethod
    def normalize_name(cls, v: str) -> str:
        """Normalize names to the canonical 'Family, Given' format.

        Collapses runs of whitespace; names that are empty or already contain
        a comma are returned as-is. A single token becomes ``"Token,"``
        (empty given part); otherwise the last token is treated as the family
        name and the rest as the given name(s).
        """
        # " ".join(v.split()) already trims the ends, so no extra strip needed.
        cleaned = " ".join(v.split())
        if not cleaned or "," in cleaned:
            return cleaned
        parts = cleaned.split()
        if len(parts) == 1:
            # Single token: treat as family name with an empty given part.
            return f"{parts[0]},"
        family, given = parts[-1], " ".join(parts[:-1])
        return f"{family}, {given}"
33+
34+
class TitleSuggestion(BaseModel):
    """Suggestion for `title`."""

    # Literal tag used as the discriminator in MetadataSuggestion.
    field: Literal["title"] = "title"
    value: str
39+
40+
41+
class DescriptionSuggestion(BaseModel):
    """Suggestion for `description` (abstract)."""

    # Literal tag used as the discriminator in MetadataSuggestion.
    field: Literal["description"] = "description"
    value: str
46+
47+
48+
class CreatorsSuggestion(BaseModel):
    """Suggestion for `creators`."""

    # Literal tag used as the discriminator in MetadataSuggestion.
    field: Literal["creators"] = "creators"
    value: list[Creator]

    @field_validator("value")
    @classmethod
    def filter_empty_names(cls, v: list[Creator]) -> list[Creator]:
        """Drop creators whose name is empty after normalization."""
        return [c for c in v if c.name]
58+
59+
60+
# Discriminated union: pydantic dispatches on the literal `field` tag of each
# suggestion model when validating/serializing.
MetadataSuggestion = Annotated[
    TitleSuggestion | DescriptionSuggestion | CreatorsSuggestion,
    Field(discriminator="field"),
]


class MetadataResult(BaseModel):
    """Container for all metadata suggestions from a workflow run."""

    suggestions: list[MetadataSuggestion]

tests/test_auth.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,3 +503,26 @@ def test_list_workflows_tenant_isolation(client, db_session):
503503
assert len(workflows) == 1
504504
assert workflows[0]["tenant_id"] == "tenant-b"
505505
assert workflows[0]["public_id"] == wf_b.public_id
506+
507+
508+
def test_read_workflow_includes_result(client, db_session):
    """GET /workflows/{id} includes `result.suggestions` when present."""
    wf = Workflow(
        status=WorkflowStatus.SUCCESS,
        url="https://example.com/test.pdf",
        tenant_id="tenant-a",
        result={"suggestions": [{"field": "title", "value": "My Title"}]},
    )
    db_session.add(wf)
    db_session.commit()
    db_session.refresh(wf)

    token = generate_test_token()
    response = client.get(
        f"/workflows/{wf.public_id}",
        headers={"Authorization": f"Bearer {token}"},
    )
    assert response.status_code == 200
    payload = response.json()
    assert "result" in payload
    suggestion = payload["result"]["suggestions"][0]
    assert suggestion["field"] == "title"
    # Pin the value too, so a serialization regression can't slip through
    # while the tag alone still matches.
    assert suggestion["value"] == "My Title"

0 commit comments

Comments
 (0)