feat(suggestions): return suggestions from metadata extraction and persist

yashlamba · yashlamba · commit da14ab801301 · 2026-03-16T16:09:06.000+01:00
diff --git a/app/activities/extract_metadata.py b/app/activities/extract_metadata.py
@@ -1,58 +1,77 @@
-"""Simple LLM-based metadata extraction activity."""
-
-import os
+"""LLM-based metadata suggestions activity."""
 
 from pydantic import BaseModel, Field
 from pydantic_ai import Agent
 from pydantic_ai.models.openai import OpenAIChatModel
 from pydantic_ai.providers.litellm import LiteLLMProvider
+from pydantic_ai.providers.ollama import OllamaProvider
 from temporalio import activity
 
+from app.config import get_settings
+from app.workflows.suggestions import MetadataResult
 
-class DocumentMetadata(BaseModel):
-    """Metadata extracted from a document."""
 
-    title: str = Field(description="Main title of the document")
-    abstract: str | None = Field(default=None, description="Document abstract")
-    authors: list[str] = Field(
-        default_factory=list, description="List of document authors"
-    )
+def _parse_llm(llm: str) -> tuple[str, str]:
+    """Parse '<provider>/<model>' into (provider, model_name) or raise ValueError."""
+    head, slash, tail = llm.partition("/")
+    if slash == "":
+        raise ValueError("Invalid LLM; expected '<provider>/<model>'")
+    provider, model_name = head.strip().lower(), tail.strip()
+    if provider not in ("litellm", "ollama"):
+        raise ValueError("Invalid LLM; provider must be 'litellm' or 'ollama'")
+    if model_name == "":
+        raise ValueError("Invalid LLM; model name is missing")
+    return provider, model_name
 
 
 class ExtractMetadataRequest(BaseModel):
-    """Request to extract metadata from document text."""
+    """Request to generate metadata suggestions from document text."""
 
     text: str = Field(description="Document text to analyze")
-    model: str = Field(default="groq/qwen/qwen3-32b", description="Model to use")
 
 
 INSTRUCTIONS = """\
-Extract structured metadata from this document text.
-Focus on finding the title, abstract/summary, and authors.
-For authors, extract individual names as separate list items.
-Only include information that is clearly stated in the text.
+You generate metadata suggestions from document text.
+
+Return a list of typed suggestions for the following fields:
+- title (string)
+- description (string; the abstract/summary)
+- creators (list of objects with: name, affiliation (optional), orcid (optional))
+
+Rules:
+- Only include information that is clearly stated in the text.
+- If a field is not present or cannot be determined, omit that suggestion entirely.
+- For creators.name, use the "Family, Given" format.
 """
 
 
-def _create_model(model_name: str):
-    """Create a model using LiteLLM provider."""
-    return OpenAIChatModel(
-        model_name=model_name,
-        provider=LiteLLMProvider(
-            api_base="https://llmgw-litellm.web.cern.ch/v1",
-            api_key=os.environ["LITELLM_API_KEY"],
-        ),
-    )
+def _create_model() -> OpenAIChatModel:
+    """Build the chat model for the provider/model named by the `llm` setting."""
+    cfg = get_settings()
+    backend, model_name = _parse_llm(cfg.llm)
+
+    # `_parse_llm` only lets "litellm" and "ollama" through, so two branches suffice.
+    if backend != "ollama":
+        provider = LiteLLMProvider(
+            api_base=cfg.litellm_api_base,
+            api_key=cfg.litellm_api_key,
+        )
+    else:
+        provider = OllamaProvider(
+            base_url=cfg.ollama_base_url,
+            api_key=cfg.ollama_api_key,
+        )
+    return OpenAIChatModel(model_name=model_name, provider=provider)
 
 
 @activity.defn
-async def metadata_extraction(request: ExtractMetadataRequest) -> DocumentMetadata:
-    """Extract metadata using LLM."""
-    model = _create_model(request.model)
+async def metadata_extraction(request: ExtractMetadataRequest) -> MetadataResult:
+    """Generate typed metadata suggestions using an LLM."""
+    model = _create_model()
     agent = Agent(
         model=model,
         instructions=INSTRUCTIONS,
-        output_type=DocumentMetadata,
+        output_type=MetadataResult,
     )
 
     result = await agent.run(request.text)
diff --git a/app/activities/store_workflow_result.py b/app/activities/store_workflow_result.py
@@ -40,4 +40,3 @@ async def store_workflow_result(request: StoreWorkflowResultRequest) -> None:
         workflow.result = request.result
         session.add(workflow)
         session.commit()
-
diff --git a/app/config.py b/app/config.py
@@ -27,8 +27,8 @@ class Settings(BaseSettings):
     allowed_origins: list[str] = ["http://localhost:3000", "http://127.0.0.1:3000"]
 
     # LLM
-    # TODO Currently we have only a single workflow, so single LLM configuration if fine,
-    # I guess we can parameterize it or make it configurable per workflow.
+    # TODO Currently we have only a single workflow, so single LLM configuration
+    # is fine. We can parameterize it or make it configurable per workflow later.
     llm: str = "ollama/qwen3:4b"
     litellm_api_base: str = "<litellm-endpoint>"
     litellm_api_key: str | None = None
diff --git a/app/routers/workflows.py b/app/routers/workflows.py
@@ -100,12 +100,18 @@ async def create_workflow(
         client = _get_temporal_client(request)
         await client.start_workflow(
             ExtractMetadata.run,
-            args=[{"url": body.url, "extractor": body.extractor, "pages": body.pages}],
+            args=[
+                {
+                    "workflow_id": workflow_id,
+                    "tenant_id": auth.tenant_id,
+                    "url": body.url,
+                    "extractor": body.extractor,
+                    "pages": body.pages,
+                }
+            ],
             id=f"extract-metadata-{workflow_id}",
             task_queue="extract-pdf-metadata-task-queue",
         )
-        workflow.status = WorkflowStatus.SUCCESS
-        session.commit()
     except Exception as e:
         print("Error(start_temporal_workflow)", e)
         try:
@@ -117,7 +123,7 @@ async def create_workflow(
             status_code=500, detail="Could not start extraction workflow"
         )
 
-    return {"public_id": workflow_id, "status": "PROCESSING"}
+    return workflow.to_dict()
 
 
 @router.get(
diff --git a/app/workflows/extract_metadata_workflow.py b/app/workflows/extract_metadata_workflow.py
@@ -8,93 +8,72 @@
 
 from app.activities.extract_metadata import ExtractMetadataRequest, metadata_extraction
 from app.activities.extract_pdf_content import ExtractPdfContentRequest, text_extraction
+from app.activities.store_workflow_result import (
+    StoreWorkflowResultRequest,
+    store_workflow_result,
+)
+from app.database.models import WorkflowStatus
+from app.workflows.suggestions import MetadataResult
 
 
-class DocumentMetadata(BaseModel):
-    """Structured metadata extracted from a PDF document."""
-
-    title: str | None = Field(default=None, description="The title of the document")
-    authors: list[str] | None = Field(
-        default=None, description="List of document authors"
-    )
-    publication_date: str | None = Field(
-        default=None,
-        description="Publication date in ISO format (YYYY-MM-DD, YYYY-MM, or YYYY)",
-    )
-    abstract: str | None = Field(
-        default=None,
-        description="Abstract or summary of the document, extracted verbatim",
-    )
-    language: str | None = Field(
-        default=None, description="Language of the document (e.g. 'en', 'fr')"
-    )
-    keywords: list[str] | None = Field(
-        default=None, description="Key topics or keywords from the document"
-    )
-
-
-METADATA_INSTRUCTIONS = """\
-You are an expert at extracting structured metadata from documents.
-
-Given the raw text content of a PDF document, extract the following metadata fields:
-- title: The main title of the document.
-- authors: A list of authors. Look for names near the title,
-  in headers, or in an authors section.
-- publication_date: The publication date in ISO format (YYYY-MM-DD, YYYY-MM, or YYYY).
-- abstract: The abstract or summary, extracted verbatim from the document.
-- language: The language the document is written in (ISO 639-1 code, e.g. "en").
-- keywords: Key topics or keywords mentioned in the document.
-
-IMPORTANT RULES:
-1. Only include information explicitly stated in the document.
-2. If a field is not present or cannot be determined, leave it as null.
-3. For the abstract, include the text verbatim from the document.
-4. Do not fabricate or infer information that is not in the text.
-"""
+class ExtractMetadataWorkflowRequest(BaseModel):
+    """Workflow request to extract PDF content and generate metadata suggestions."""
 
-# metadata_agent = Agent(
-#     "openai:gpt-4o-mini",
-#     instructions=METADATA_INSTRUCTIONS,
-#     output_type=DocumentMetadata,
-#     name="metadata_extractor",
-# )
-#
-# temporal_metadata_agent = TemporalAgent(
-#     metadata_agent,
-#     model_activity_config=workflow.ActivityConfig(
-#         start_to_close_timeout=timedelta(minutes=5),
-#     ),
-# )
-#
+    workflow_id: str = Field(description="Workflow public_id (DB primary identifier)")
+    tenant_id: str = Field(description="Tenant id (ownership check)")
+    url: str  # PDF location; forwarded to the text-extraction activity
+    extractor: str = "pdfplumber"  # extraction backend name, forwarded as-is
+    pages: list[int] | None = None  # page subset (None: presumably all; confirm)
 
 
 @workflow.defn
 class ExtractMetadata(PydanticAIWorkflow):
     """Workflow that extracts content from a PDF and uses an LLM to extract metadata."""
 
     @workflow.run
-    async def run(self, request_data: dict) -> DocumentMetadata:
-        """Execute the metadata extraction workflow.
-
-        Args:
-            request_data: Dictionary containing PDF extraction parameters
-                (url, extractor, pages).
-
-        Returns:
-            DocumentMetadata: Extracted metadata from the PDF document.
-        """
-        # Activity 1: Extract PDF text
-        content = await workflow.execute_activity(
-            text_extraction,
-            ExtractPdfContentRequest(**request_data),
-            start_to_close_timeout=timedelta(minutes=5),
-        )
-
-        # Activity 2: Extract metadata using LLM
-        metadata = await workflow.execute_activity(
-            metadata_extraction,
-            ExtractMetadataRequest(text=content.text),
-            start_to_close_timeout=timedelta(minutes=5),
+    async def run(self, request_data: dict) -> MetadataResult:
+        """Extract PDF text, generate suggestions, and persist the outcome."""
+        request = ExtractMetadataWorkflowRequest(**request_data)
+        try:  # NOTE(review): no retry_policy is passed below — confirm retry limits
+            # Activity 1: extract the PDF text for the requested url/pages
+            content = await workflow.execute_activity(
+                text_extraction,
+                ExtractPdfContentRequest(
+                    url=request.url,
+                    extractor=request.extractor,
+                    pages=request.pages,
+                ),
+                start_to_close_timeout=timedelta(minutes=5),
+            )
+
+            # Activity 2: turn the extracted text into metadata suggestions (LLM)
+            result = await workflow.execute_activity(
+                metadata_extraction,
+                ExtractMetadataRequest(text=content.text),
+                start_to_close_timeout=timedelta(minutes=5),
+            )
+        except Exception:  # failure path: store ERROR with no result, then re-raise
+            await workflow.execute_activity(
+                store_workflow_result,
+                StoreWorkflowResultRequest(
+                    workflow_id=request.workflow_id,
+                    tenant_id=request.tenant_id,
+                    status=WorkflowStatus.ERROR,
+                    result=None,
+                ),
+                start_to_close_timeout=timedelta(minutes=1),
+            )
+            raise
+
+        await workflow.execute_activity(  # success path: persist the suggestions
+            store_workflow_result,
+            StoreWorkflowResultRequest(
+                workflow_id=request.workflow_id,
+                tenant_id=request.tenant_id,
+                status=WorkflowStatus.SUCCESS,
+                result=result.model_dump(),
+            ),
+            start_to_close_timeout=timedelta(minutes=1),
         )
 
-        return metadata
+        return result
diff --git a/app/workflows/suggestions.py b/app/workflows/suggestions.py
@@ -0,0 +1,68 @@
+"""Typed metadata suggestions returned by the workflow."""
+
+from typing import Annotated, Literal
+
+from pydantic import BaseModel, Field, field_validator
+
+
+class Creator(BaseModel):
+    """One creator/author entry; `name` is normalized during validation."""
+
+    name: str  # canonical form: "Family, Given"
+    affiliation: str | None = None
+    orcid: str | None = None
+
+    @field_validator("name")
+    @classmethod
+    def normalize_name(cls, v: str) -> str:
+        """Collapse whitespace and reorder 'Given Family' into 'Family, Given'."""
+        collapsed = " ".join(v.split())
+        # Blank names and names that already contain a comma pass through.
+        if not collapsed or "," in collapsed:
+            return collapsed
+        # No comma: treat the last token as the family name.
+        *given_tokens, family = collapsed.split(" ")
+        if not given_tokens:
+            # Single token: family name followed by a bare comma.
+            return f"{family},"
+        given = " ".join(given_tokens)
+        return f"{family}, {given}"
+
+
+class TitleSuggestion(BaseModel):
+    """Suggested value for the `title` field."""
+
+    field: Literal["title"] = "title"  # union discriminator tag
+    value: str
+
+
+class DescriptionSuggestion(BaseModel):
+    """Suggested value for the `description` field (the abstract/summary)."""
+
+    field: Literal["description"] = "description"  # union discriminator tag
+    value: str
+
+
+class CreatorsSuggestion(BaseModel):
+    """Suggested list of `creators`; entries with blank names are dropped."""
+
+    field: Literal["creators"] = "creators"  # union discriminator tag
+    value: list[Creator]
+
+    @field_validator("value")
+    @classmethod
+    def filter_empty_names(cls, v: list[Creator]) -> list[Creator]:
+        """Keep only the creators whose name is non-empty."""
+        return [creator for creator in v if creator.name]
+
+
+MetadataSuggestion = Annotated[  # tagged union of the suggestion models
+    TitleSuggestion | DescriptionSuggestion | CreatorsSuggestion,
+    Field(discriminator="field"),  # member model is selected by its `field` value
+]
+
+
+class MetadataResult(BaseModel):
+    """Container for all metadata suggestions from a workflow run."""
+
+    suggestions: list[MetadataSuggestion]  # required; no default is declared
diff --git a/tests/test_auth.py b/tests/test_auth.py