Skip to content

Commit da14ab8

Browse files
committed
feat(suggestions): return suggestions from metadata extraction and persist
1 parent 33bf36c commit da14ab8

File tree

7 files changed

+210
-114
lines changed

7 files changed

+210
-114
lines changed

app/activities/extract_metadata.py

Lines changed: 48 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,77 @@
1-
"""Simple LLM-based metadata extraction activity."""
2-
3-
import os
1+
"""LLM-based metadata suggestions activity."""
42

53
from pydantic import BaseModel, Field
64
from pydantic_ai import Agent
75
from pydantic_ai.models.openai import OpenAIChatModel
86
from pydantic_ai.providers.litellm import LiteLLMProvider
7+
from pydantic_ai.providers.ollama import OllamaProvider
98
from temporalio import activity
109

10+
from app.config import get_settings
11+
from app.workflows.suggestions import MetadataResult
1112

12-
class DocumentMetadata(BaseModel):
13-
"""Metadata extracted from a document."""
1413

15-
title: str = Field(description="Main title of the document")
16-
abstract: str | None = Field(default=None, description="Document abstract")
17-
authors: list[str] = Field(
18-
default_factory=list, description="List of document authors"
19-
)
14+
def _parse_llm(llm: str) -> tuple[str, str]:
15+
provider, sep, model_name = llm.partition("/")
16+
if not sep:
17+
raise ValueError("Invalid LLM; expected '<provider>/<model>'")
18+
provider = provider.strip().lower()
19+
model_name = model_name.strip()
20+
if provider not in {"litellm", "ollama"}:
21+
raise ValueError("Invalid LLM; provider must be 'litellm' or 'ollama'")
22+
if not model_name:
23+
raise ValueError("Invalid LLM; model name is missing")
24+
return provider, model_name
2025

2126

2227
class ExtractMetadataRequest(BaseModel):
23-
"""Request to extract metadata from document text."""
28+
"""Request to generate metadata suggestions from document text."""
2429

2530
text: str = Field(description="Document text to analyze")
26-
model: str = Field(default="groq/qwen/qwen3-32b", description="Model to use")
2731

2832

2933
INSTRUCTIONS = """\
30-
Extract structured metadata from this document text.
31-
Focus on finding the title, abstract/summary, and authors.
32-
For authors, extract individual names as separate list items.
33-
Only include information that is clearly stated in the text.
34+
You generate metadata suggestions from document text.
35+
36+
Return a list of typed suggestions for the following fields:
37+
- title (string)
38+
- description (string; the abstract/summary)
39+
- creators (list of objects with: name, affiliation (optional), orcid (optional))
40+
41+
Rules:
42+
- Only include information that is clearly stated in the text.
43+
- If a field is not present or cannot be determined, omit that suggestion entirely.
44+
- For creators.name, use the "Family, Given" format.
3445
"""
3546

3647

37-
def _create_model(model_name: str):
38-
"""Create a model using LiteLLM provider."""
39-
return OpenAIChatModel(
40-
model_name=model_name,
41-
provider=LiteLLMProvider(
42-
api_base="https://llmgw-litellm.web.cern.ch/v1",
43-
api_key=os.environ["LITELLM_API_KEY"],
44-
),
45-
)
48+
def _create_model() -> OpenAIChatModel:
49+
"""Create an OpenAI-compatible chat model from settings."""
50+
settings = get_settings()
51+
provider_name, model_name = _parse_llm(settings.llm)
52+
53+
if provider_name == "ollama":
54+
provider = OllamaProvider(
55+
base_url=settings.ollama_base_url,
56+
api_key=settings.ollama_api_key,
57+
)
58+
else:
59+
provider = LiteLLMProvider(
60+
api_base=settings.litellm_api_base,
61+
api_key=settings.litellm_api_key,
62+
)
63+
64+
return OpenAIChatModel(model_name=model_name, provider=provider)
4665

4766

4867
@activity.defn
49-
async def metadata_extraction(request: ExtractMetadataRequest) -> DocumentMetadata:
50-
"""Extract metadata using LLM."""
51-
model = _create_model(request.model)
68+
async def metadata_extraction(request: ExtractMetadataRequest) -> MetadataResult:
69+
"""Generate typed metadata suggestions using an LLM."""
70+
model = _create_model()
5271
agent = Agent(
5372
model=model,
5473
instructions=INSTRUCTIONS,
55-
output_type=DocumentMetadata,
74+
output_type=MetadataResult,
5675
)
5776

5877
result = await agent.run(request.text)

app/activities/store_workflow_result.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,3 @@ async def store_workflow_result(request: StoreWorkflowResultRequest) -> None:
4040
workflow.result = request.result
4141
session.add(workflow)
4242
session.commit()
43-

app/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@ class Settings(BaseSettings):
2727
allowed_origins: list[str] = ["http://localhost:3000", "http://127.0.0.1:3000"]
2828

2929
# LLM
30-
# TODO Currently we have only a single workflow, so single LLM configuration if fine,
31-
# I guess we can parameterize it or make it configurable per workflow.
30+
# TODO Currently we have only a single workflow, so single LLM configuration
31+
# is fine. We can parameterize it or make it configurable per workflow later.
3232
llm: str = "ollama/qwen3:4b"
3333
litellm_api_base: str = "<litellm-endpoint>"
3434
litellm_api_key: str | None = None

app/routers/workflows.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,18 @@ async def create_workflow(
100100
client = _get_temporal_client(request)
101101
await client.start_workflow(
102102
ExtractMetadata.run,
103-
args=[{"url": body.url, "extractor": body.extractor, "pages": body.pages}],
103+
args=[
104+
{
105+
"workflow_id": workflow_id,
106+
"tenant_id": auth.tenant_id,
107+
"url": body.url,
108+
"extractor": body.extractor,
109+
"pages": body.pages,
110+
}
111+
],
104112
id=f"extract-metadata-{workflow_id}",
105113
task_queue="extract-pdf-metadata-task-queue",
106114
)
107-
workflow.status = WorkflowStatus.SUCCESS
108-
session.commit()
109115
except Exception as e:
110116
print("Error(start_temporal_workflow)", e)
111117
try:
@@ -117,7 +123,7 @@ async def create_workflow(
117123
status_code=500, detail="Could not start extraction workflow"
118124
)
119125

120-
return {"public_id": workflow_id, "status": "PROCESSING"}
126+
return workflow.to_dict()
121127

122128

123129
@router.get(

app/workflows/extract_metadata_workflow.py

Lines changed: 57 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -8,93 +8,72 @@
88

99
from app.activities.extract_metadata import ExtractMetadataRequest, metadata_extraction
1010
from app.activities.extract_pdf_content import ExtractPdfContentRequest, text_extraction
11+
from app.activities.store_workflow_result import (
12+
StoreWorkflowResultRequest,
13+
store_workflow_result,
14+
)
15+
from app.database.models import WorkflowStatus
16+
from app.workflows.suggestions import MetadataResult
1117

1218

13-
class DocumentMetadata(BaseModel):
14-
"""Structured metadata extracted from a PDF document."""
15-
16-
title: str | None = Field(default=None, description="The title of the document")
17-
authors: list[str] | None = Field(
18-
default=None, description="List of document authors"
19-
)
20-
publication_date: str | None = Field(
21-
default=None,
22-
description="Publication date in ISO format (YYYY-MM-DD, YYYY-MM, or YYYY)",
23-
)
24-
abstract: str | None = Field(
25-
default=None,
26-
description="Abstract or summary of the document, extracted verbatim",
27-
)
28-
language: str | None = Field(
29-
default=None, description="Language of the document (e.g. 'en', 'fr')"
30-
)
31-
keywords: list[str] | None = Field(
32-
default=None, description="Key topics or keywords from the document"
33-
)
34-
35-
36-
METADATA_INSTRUCTIONS = """\
37-
You are an expert at extracting structured metadata from documents.
38-
39-
Given the raw text content of a PDF document, extract the following metadata fields:
40-
- title: The main title of the document.
41-
- authors: A list of authors. Look for names near the title,
42-
in headers, or in an authors section.
43-
- publication_date: The publication date in ISO format (YYYY-MM-DD, YYYY-MM, or YYYY).
44-
- abstract: The abstract or summary, extracted verbatim from the document.
45-
- language: The language the document is written in (ISO 639-1 code, e.g. "en").
46-
- keywords: Key topics or keywords mentioned in the document.
47-
48-
IMPORTANT RULES:
49-
1. Only include information explicitly stated in the document.
50-
2. If a field is not present or cannot be determined, leave it as null.
51-
3. For the abstract, include the text verbatim from the document.
52-
4. Do not fabricate or infer information that is not in the text.
53-
"""
19+
class ExtractMetadataWorkflowRequest(BaseModel):
20+
"""Workflow request to extract PDF content and generate metadata suggestions."""
5421

55-
# metadata_agent = Agent(
56-
# "openai:gpt-4o-mini",
57-
# instructions=METADATA_INSTRUCTIONS,
58-
# output_type=DocumentMetadata,
59-
# name="metadata_extractor",
60-
# )
61-
#
62-
# temporal_metadata_agent = TemporalAgent(
63-
# metadata_agent,
64-
# model_activity_config=workflow.ActivityConfig(
65-
# start_to_close_timeout=timedelta(minutes=5),
66-
# ),
67-
# )
68-
#
22+
workflow_id: str = Field(description="Workflow public_id (DB primary identifier)")
23+
tenant_id: str = Field(description="Tenant id (ownership check)")
24+
url: str
25+
extractor: str = "pdfplumber"
26+
pages: list[int] | None = None
6927

7028

7129
@workflow.defn
7230
class ExtractMetadata(PydanticAIWorkflow):
7331
"""Workflow that extracts content from a PDF and uses an LLM to extract metadata."""
7432

7533
@workflow.run
76-
async def run(self, request_data: dict) -> DocumentMetadata:
77-
"""Execute the metadata extraction workflow.
78-
79-
Args:
80-
request_data: Dictionary containing PDF extraction parameters
81-
(url, extractor, pages).
82-
83-
Returns:
84-
DocumentMetadata: Extracted metadata from the PDF document.
85-
"""
86-
# Activity 1: Extract PDF text
87-
content = await workflow.execute_activity(
88-
text_extraction,
89-
ExtractPdfContentRequest(**request_data),
90-
start_to_close_timeout=timedelta(minutes=5),
91-
)
92-
93-
# Activity 2: Extract metadata using LLM
94-
metadata = await workflow.execute_activity(
95-
metadata_extraction,
96-
ExtractMetadataRequest(text=content.text),
97-
start_to_close_timeout=timedelta(minutes=5),
34+
async def run(self, request_data: dict) -> MetadataResult:
35+
"""Execute the extraction + suggestions workflow."""
36+
request = ExtractMetadataWorkflowRequest(**request_data)
37+
try:
38+
# Activity 1: Extract PDF text
39+
content = await workflow.execute_activity(
40+
text_extraction,
41+
ExtractPdfContentRequest(
42+
url=request.url,
43+
extractor=request.extractor,
44+
pages=request.pages,
45+
),
46+
start_to_close_timeout=timedelta(minutes=5),
47+
)
48+
49+
# Activity 2: Generate metadata suggestions using LLM
50+
result = await workflow.execute_activity(
51+
metadata_extraction,
52+
ExtractMetadataRequest(text=content.text),
53+
start_to_close_timeout=timedelta(minutes=5),
54+
)
55+
except Exception:
56+
await workflow.execute_activity(
57+
store_workflow_result,
58+
StoreWorkflowResultRequest(
59+
workflow_id=request.workflow_id,
60+
tenant_id=request.tenant_id,
61+
status=WorkflowStatus.ERROR,
62+
result=None,
63+
),
64+
start_to_close_timeout=timedelta(minutes=1),
65+
)
66+
raise
67+
68+
await workflow.execute_activity(
69+
store_workflow_result,
70+
StoreWorkflowResultRequest(
71+
workflow_id=request.workflow_id,
72+
tenant_id=request.tenant_id,
73+
status=WorkflowStatus.SUCCESS,
74+
result=result.model_dump(),
75+
),
76+
start_to_close_timeout=timedelta(minutes=1),
9877
)
9978

100-
return metadata
79+
return result

app/workflows/suggestions.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
"""Typed metadata suggestions returned by the workflow."""
2+
3+
# from __future__ import annotations
4+
5+
from typing import Annotated, Literal
6+
7+
from pydantic import BaseModel, Field, field_validator
8+
9+
10+
class Creator(BaseModel):
11+
"""A structured creator/author."""
12+
13+
name: str # "Family, Given"
14+
affiliation: str | None = None
15+
orcid: str | None = None
16+
17+
@field_validator("name")
18+
@classmethod
19+
def normalize_name(cls, v: str) -> str:
20+
"""Normalize names to the canonical 'Family, Given' format."""
21+
cleaned = " ".join(v.split()).strip()
22+
if not cleaned:
23+
return cleaned
24+
if "," in cleaned:
25+
return cleaned
26+
parts = cleaned.split(" ")
27+
if len(parts) == 1:
28+
return f"{parts[0]},"
29+
family = parts[-1]
30+
given = " ".join(parts[:-1])
31+
return f"{family}, {given}"
32+
33+
34+
class TitleSuggestion(BaseModel):
35+
"""Suggestion for `title`."""
36+
37+
field: Literal["title"] = "title"
38+
value: str
39+
40+
41+
class DescriptionSuggestion(BaseModel):
42+
"""Suggestion for `description` (abstract)."""
43+
44+
field: Literal["description"] = "description"
45+
value: str
46+
47+
48+
class CreatorsSuggestion(BaseModel):
49+
"""Suggestion for `creators`."""
50+
51+
field: Literal["creators"] = "creators"
52+
value: list[Creator]
53+
54+
@field_validator("value")
55+
@classmethod
56+
def filter_empty_names(cls, v: list[Creator]) -> list[Creator]:
57+
"""Filter out creators with empty names."""
58+
return [c for c in v if c.name]
59+
60+
61+
MetadataSuggestion = Annotated[
62+
TitleSuggestion | DescriptionSuggestion | CreatorsSuggestion,
63+
Field(discriminator="field"),
64+
]
65+
66+
67+
class MetadataResult(BaseModel):
68+
"""Container for all metadata suggestions from a workflow run."""
69+
70+
suggestions: list[MetadataSuggestion]

0 commit comments

Comments
 (0)