Skip to content

Commit d480dab

Browse files
committed
feat(suggestions): return suggestions from metadata extraction and persist
1 parent 33bf36c commit d480dab

File tree

5 files changed

+204
-112
lines changed

5 files changed

+204
-112
lines changed

app/activities/extract_metadata.py

Lines changed: 48 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,58 +1,76 @@
1-
"""Simple LLM-based metadata extraction activity."""
2-
3-
import os
1+
"""LLM-based metadata suggestions activity."""
42

53
from pydantic import BaseModel, Field
64
from pydantic_ai import Agent
75
from pydantic_ai.models.openai import OpenAIChatModel
86
from pydantic_ai.providers.litellm import LiteLLMProvider
7+
from pydantic_ai.providers.ollama import OllamaProvider
98
from temporalio import activity
109

10+
from app.config import get_settings
11+
from app.workflows.suggestions import MetadataResult
1112

12-
class DocumentMetadata(BaseModel):
13-
"""Metadata extracted from a document."""
14-
15-
title: str = Field(description="Main title of the document")
16-
abstract: str | None = Field(default=None, description="Document abstract")
17-
authors: list[str] = Field(
18-
default_factory=list, description="List of document authors"
19-
)
13+
def _parse_llm(llm: str) -> tuple[str, str]:
14+
provider, sep, model_name = llm.partition("/")
15+
if not sep:
16+
raise ValueError("Invalid LLM; expected '<provider>/<model>'")
17+
provider = provider.strip().lower()
18+
model_name = model_name.strip()
19+
if provider not in {"litellm", "ollama"}:
20+
raise ValueError("Invalid LLM; provider must be 'litellm' or 'ollama'")
21+
if not model_name:
22+
raise ValueError("Invalid LLM; model name is missing")
23+
return provider, model_name
2024

2125

2226
class ExtractMetadataRequest(BaseModel):
    """Request to generate metadata suggestions from document text."""

    # Raw text extracted from the PDF; passed verbatim to the LLM agent.
    text: str = Field(description="Document text to analyze")
2730

2831

2932
# System prompt for the suggestion agent; the output schema it describes must
# stay in sync with the MetadataResult / suggestion models.
INSTRUCTIONS = """\
You generate metadata suggestions from document text.

Return a list of typed suggestions for the following fields:
- title (string)
- description (string; the abstract/summary)
- creators (list of objects with: name, affiliation (optional), orcid (optional))

Rules:
- Only include information that is clearly stated in the text.
- If a field is not present or cannot be determined, omit that suggestion entirely.
- For creators.name, use the "Family, Given" format.
"""
3545

3646

37-
def _create_model() -> OpenAIChatModel:
    """Create an OpenAI-compatible chat model from settings."""
    settings = get_settings()
    provider_name, model_name = _parse_llm(settings.llm)

    # _parse_llm only admits "ollama" or "litellm", so a simple two-way
    # selection is exhaustive here.
    provider = (
        OllamaProvider(
            base_url=settings.ollama_base_url,
            api_key=settings.ollama_api_key,
        )
        if provider_name == "ollama"
        else LiteLLMProvider(
            api_base=settings.litellm_api_base,
            api_key=settings.litellm_api_key,
        )
    )
    return OpenAIChatModel(model_name=model_name, provider=provider)
4664

4765

4866
@activity.defn
49-
async def metadata_extraction(request: ExtractMetadataRequest) -> DocumentMetadata:
50-
"""Extract metadata using LLM."""
51-
model = _create_model(request.model)
67+
async def metadata_extraction(request: ExtractMetadataRequest) -> MetadataResult:
68+
"""Generate typed metadata suggestions using an LLM."""
69+
model = _create_model()
5270
agent = Agent(
5371
model=model,
5472
instructions=INSTRUCTIONS,
55-
output_type=DocumentMetadata,
73+
output_type=MetadataResult,
5674
)
5775

5876
result = await agent.run(request.text)

app/routers/workflows.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,12 +100,18 @@ async def create_workflow(
100100
client = _get_temporal_client(request)
101101
await client.start_workflow(
102102
ExtractMetadata.run,
103-
args=[{"url": body.url, "extractor": body.extractor, "pages": body.pages}],
103+
args=[
104+
{
105+
"workflow_id": workflow_id,
106+
"tenant_id": auth.tenant_id,
107+
"url": body.url,
108+
"extractor": body.extractor,
109+
"pages": body.pages,
110+
}
111+
],
104112
id=f"extract-metadata-{workflow_id}",
105113
task_queue="extract-pdf-metadata-task-queue",
106114
)
107-
workflow.status = WorkflowStatus.SUCCESS
108-
session.commit()
109115
except Exception as e:
110116
print("Error(start_temporal_workflow)", e)
111117
try:
@@ -117,7 +123,7 @@ async def create_workflow(
117123
status_code=500, detail="Could not start extraction workflow"
118124
)
119125

120-
return {"public_id": workflow_id, "status": "PROCESSING"}
126+
return workflow.to_dict()
121127

122128

123129
@router.get(

app/workflows/extract_metadata_workflow.py

Lines changed: 54 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -8,93 +8,69 @@
88

99
from app.activities.extract_metadata import ExtractMetadataRequest, metadata_extraction
1010
from app.activities.extract_pdf_content import ExtractPdfContentRequest, text_extraction
11+
from app.activities.store_workflow_result import StoreWorkflowResultRequest, store_workflow_result
12+
from app.database.models import WorkflowStatus
13+
from app.workflows.suggestions import MetadataResult
1114

1215

13-
class ExtractMetadataWorkflowRequest(BaseModel):
    """Workflow request to extract PDF content and generate metadata suggestions."""

    # Identifiers used by the result-storage activity to persist the outcome.
    workflow_id: str = Field(description="Workflow public_id (DB primary identifier)")
    tenant_id: str = Field(description="Tenant id (ownership check)")
    # PDF extraction parameters forwarded to the text-extraction activity.
    url: str
    extractor: str = "pdfplumber"
    pages: list[int] | None = None
6924

7025

7126
@workflow.defn
class ExtractMetadata(PydanticAIWorkflow):
    """Workflow that extracts content from a PDF and uses an LLM to extract metadata."""

    async def _store_result(
        self,
        request: ExtractMetadataWorkflowRequest,
        status: WorkflowStatus,
        result: dict | None,
    ) -> None:
        """Persist the terminal workflow status (and optional result) to the DB."""
        await workflow.execute_activity(
            store_workflow_result,
            StoreWorkflowResultRequest(
                workflow_id=request.workflow_id,
                tenant_id=request.tenant_id,
                status=status,
                result=result,
            ),
            start_to_close_timeout=timedelta(minutes=1),
        )

    @workflow.run
    async def run(self, request_data: dict) -> MetadataResult:
        """Execute the extraction + suggestions workflow.

        Args:
            request_data: Serialized ``ExtractMetadataWorkflowRequest`` fields
                (workflow_id, tenant_id, url, extractor, pages).

        Returns:
            MetadataResult: Typed metadata suggestions generated by the LLM.

        Raises:
            Exception: Re-raises any activity failure after recording an
                ERROR status, so Temporal still observes the failure.
        """
        request = ExtractMetadataWorkflowRequest(**request_data)
        try:
            # Activity 1: Extract PDF text
            content = await workflow.execute_activity(
                text_extraction,
                ExtractPdfContentRequest(
                    url=request.url,
                    extractor=request.extractor,
                    pages=request.pages,
                ),
                start_to_close_timeout=timedelta(minutes=5),
            )

            # Activity 2: Generate metadata suggestions using LLM
            result = await workflow.execute_activity(
                metadata_extraction,
                ExtractMetadataRequest(text=content.text),
                start_to_close_timeout=timedelta(minutes=5),
            )
        except Exception:
            # Record the failure before surfacing it to Temporal.
            await self._store_result(request, WorkflowStatus.ERROR, None)
            raise

        await self._store_result(request, WorkflowStatus.SUCCESS, result.model_dump())
        return result

app/workflows/suggestions.py

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
"""Typed metadata suggestions returned by the workflow."""
2+
3+
# from __future__ import annotations
4+
5+
from typing import Annotated, Literal
6+
7+
from pydantic import BaseModel, Field, field_validator
8+
9+
10+
class Creator(BaseModel):
    """A structured creator/author."""

    # Canonical "Family, Given" form, enforced by normalize_name below.
    name: str
    affiliation: str | None = None
    orcid: str | None = None

    @field_validator("name")
    @classmethod
    def normalize_name(cls, v: str) -> str:
        """Normalize names to the canonical 'Family, Given' format.

        Collapses runs of whitespace; names that are empty or already contain
        a comma are returned as-is. A single token becomes ``"Token,"``
        (empty given part); otherwise the last token is treated as the family
        name and the rest as the given name(s).
        """
        # " ".join(v.split()) already trims the ends, so no extra strip needed.
        cleaned = " ".join(v.split())
        if not cleaned or "," in cleaned:
            return cleaned
        parts = cleaned.split()
        if len(parts) == 1:
            # Single token: treat as family name with an empty given part.
            return f"{parts[0]},"
        family, given = parts[-1], " ".join(parts[:-1])
        return f"{family}, {given}"
33+
34+
class TitleSuggestion(BaseModel):
    """Suggestion for `title`."""

    # Literal tag used as the discriminator in MetadataSuggestion.
    field: Literal["title"] = "title"
    value: str
39+
40+
41+
class DescriptionSuggestion(BaseModel):
    """Suggestion for `description` (abstract)."""

    # Literal tag used as the discriminator in MetadataSuggestion.
    field: Literal["description"] = "description"
    value: str
46+
47+
48+
class CreatorsSuggestion(BaseModel):
    """Suggestion for `creators`."""

    # Literal tag used as the discriminator in MetadataSuggestion.
    field: Literal["creators"] = "creators"
    value: list[Creator]

    @field_validator("value")
    @classmethod
    def filter_empty_names(cls, v: list[Creator]) -> list[Creator]:
        """Drop creators whose name is empty after normalization."""
        return [c for c in v if c.name]
58+
59+
60+
# Discriminated union: pydantic dispatches on the literal `field` tag of each
# suggestion model when validating/serializing.
MetadataSuggestion = Annotated[
    TitleSuggestion | DescriptionSuggestion | CreatorsSuggestion,
    Field(discriminator="field"),
]


class MetadataResult(BaseModel):
    """Container for all metadata suggestions from a workflow run."""

    suggestions: list[MetadataSuggestion]

tests/test_auth.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,3 +503,26 @@ def test_list_workflows_tenant_isolation(client, db_session):
503503
assert len(workflows) == 1
504504
assert workflows[0]["tenant_id"] == "tenant-b"
505505
assert workflows[0]["public_id"] == wf_b.public_id
506+
507+
508+
def test_read_workflow_includes_result(client, db_session):
    """GET /workflows/{id} includes `result.suggestions` when present."""
    wf = Workflow(
        status=WorkflowStatus.SUCCESS,
        url="https://example.com/test.pdf",
        tenant_id="tenant-a",
        result={"suggestions": [{"field": "title", "value": "My Title"}]},
    )
    db_session.add(wf)
    db_session.commit()
    db_session.refresh(wf)

    token = generate_test_token()
    response = client.get(
        f"/workflows/{wf.public_id}",
        headers={"Authorization": f"Bearer {token}"},
    )
    assert response.status_code == 200
    payload = response.json()
    assert "result" in payload
    suggestion = payload["result"]["suggestions"][0]
    assert suggestion["field"] == "title"
    # Pin the value too, so a serialization regression can't slip through
    # while the tag alone still matches.
    assert suggestion["value"] == "My Title"

0 commit comments

Comments
 (0)