Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion README.md
1 change: 1 addition & 0 deletions py/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@
"RetrievalService",
"GraphService",
"AudioParser",
"VideoParser",
"BMPParser",
"DOCParser",
"DOCXParser",
Expand Down
1 change: 1 addition & 0 deletions py/core/parsers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

__all__ = [
"AudioParser",
"VideoParser",
"BMPParser",
"DOCParser",
"DOCXParser",
Expand Down
2 changes: 2 additions & 0 deletions py/core/parsers/media/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,11 @@
from .ppt_parser import PPTParser
from .pptx_parser import PPTXParser
from .rtf_parser import RTFParser
from .video_parser import VideoParser

__all__ = [
"AudioParser",
"VideoParser",
"BMPParser",
"DOCParser",
"DOCXParser",
Expand Down
163 changes: 163 additions & 0 deletions py/core/parsers/media/video_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
import base64
import logging
from typing import AsyncGenerator

from core.base.abstractions import GenerationConfig
from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
DatabaseProvider,
IngestionConfig,
)

logger = logging.getLogger(__name__)


class VideoParser(AsyncParser[str | bytes]):
    """
    A parser for video files.

    The video is handed to a vision-language model (VLM) together with a
    prompt template; the model's text answer is yielded as the parsed
    content. A ``str`` input is forwarded as-is as the video URL, while raw
    ``bytes`` are inlined as a base64 ``data:`` URL.
    """

    # Mapping of file extensions to MIME types
    MIME_TYPE_MAPPING = {
        "mp4": "video/mp4",
        "avi": "video/avi",
        "mov": "video/quicktime",
        "mkv": "video/x-matroska",
    }

    # Default and maximum size accepted for raw-bytes input (5MB).
    MAX_BYTES_LIMIT = 5 * 1024 * 1024

    def __init__(
        self,
        config: IngestionConfig,
        database_provider: DatabaseProvider,
        llm_provider: CompletionProvider,
    ):
        """
        Store the providers and resolve the default vision model.

        Args:
            config: Ingestion configuration; ``config.vlm`` takes priority
                over ``config.app.vlm`` as the default model.
            database_provider: Provider whose ``prompts_handler`` supplies
                the prompt template.
            llm_provider: Provider used to request the completion.
        """
        super().__init__()

        self.config = config
        self.database_provider = database_provider
        self.llm_provider = llm_provider

        # Parenthesised on purpose: a bare ``a or b if c else d`` parses as
        # ``(a or b) if c else d`` and would discard ``config.vlm`` whenever
        # ``config.app`` is unset.
        self.vlm = self.config.vlm or (
            self.config.app.vlm if self.config.app else None
        )
        self.video_prompt_name = "video_understanding"
        self.video_prompt_args: dict = {}

        logger.info(
            "Video parser initialized with default prompt template: %s and vlm: %s",
            self.video_prompt_name,
            self.vlm,
        )

    async def ingest(  # type: ignore[override]
        self, data: str | bytes, **kwargs
    ) -> AsyncGenerator[str, None]:
        """
        Process video file for ingestion.

        Args:
            data: The video data to process, file url or raw bytes.
            **kwargs: Only ``extra_fields`` is read; it is a dict with:
                - file_type: Required. The type of the video file, one of
                  the keys of ``MIME_TYPE_MAPPING`` (e.g., "mp4", "avi").
                - bytes_limit: Optional integer size limit for raw bytes
                  (default and maximum: 5MB).
                - vlm: Optional vision model overriding the default.
                - prompt_name: Optional name of the prompt template to use.
                - input_args: Optional arguments for the prompt template.

        Yields:
            str: The content of the model's response (a single item).

        Raises:
            ValueError: If ``file_type`` is missing or unsupported, if
                ``bytes_limit`` is invalid or exceeded, if no vision model
                or prompts handler is available, or if the model returns
                no content.
            TypeError: If ``data`` is neither ``str`` nor ``bytes``.
        """
        # Tolerate an explicit ``extra_fields=None`` as well as a missing key.
        extra_fields = kwargs.get("extra_fields") or {}

        file_type = extra_fields.get("file_type")
        if not file_type:
            raise ValueError("file_type must be provided")
        if file_type not in self.MIME_TYPE_MAPPING:
            raise ValueError(
                f"file type must be one of {list(self.MIME_TYPE_MAPPING.keys())}"
            )

        bytes_limit = extra_fields.get("bytes_limit", self.MAX_BYTES_LIMIT)
        if not isinstance(bytes_limit, int):
            raise ValueError("bytes_limit must be an integer")

        vlm = extra_fields.get("vlm")
        prompt_name = extra_fields.get("prompt_name")
        input_args = extra_fields.get("input_args")

        if isinstance(data, str):
            # A string is passed through untouched as the video URL.
            url_or_base64 = data
        elif isinstance(data, bytes):
            # The size limit only applies to payloads that get inlined.
            if not 0 <= bytes_limit <= self.MAX_BYTES_LIMIT:
                raise ValueError(
                    "bytes_limit must be a non-negative integer up to 5MB"
                )
            if len(data) > bytes_limit:
                raise ValueError(
                    f"file raw bytes size must not exceed {bytes_limit} bytes"
                )
            encoded = base64.b64encode(data).decode("utf-8")
            # Use the real MIME type (e.g. video/quicktime for "mov"), not
            # the bare file extension, in the data URL.
            mime_type = self.MIME_TYPE_MAPPING[file_type]
            url_or_base64 = f"data:{mime_type};base64,{encoded}"
        else:
            raise TypeError(
                f"data must be str or bytes, got {type(data).__name__}"
            )

        model = vlm or self.vlm
        if not model:
            raise ValueError("Vision model (vlm) must be provided")
        generation_config = GenerationConfig(
            model=model,
            stream=False,
        )

        prompt_name = prompt_name or self.video_prompt_name
        prompt_args = input_args or self.video_prompt_args
        prompts_handler = getattr(
            self.database_provider, "prompts_handler", None
        )
        if not prompts_handler:
            raise ValueError(
                "Prompts handler is not available in the provider"
            )
        video_prompt_text = await prompts_handler.get_cached_prompt(
            prompt_name=prompt_name,
            inputs=prompt_args,
        )

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video_url",
                        "video_url": {"url": url_or_base64},
                    },
                    {"type": "text", "text": video_prompt_text},
                ],
            }
        ]

        try:
            response = await self.llm_provider.aget_completion(
                messages=messages, generation_config=generation_config
            )

            if not response.choices or not response.choices[0].message:
                raise ValueError("No response content")

            content = response.choices[0].message.content
            if not content:
                raise ValueError("Response content is empty")

            yield content

        except Exception as e:
            # Only a short prefix is logged so an inlined base64 payload
            # does not flood the log; the error itself is re-raised.
            logger.error(
                "Error processing file %s: %s", url_or_base64[:50], e
            )
            raise
46 changes: 46 additions & 0 deletions py/core/providers/database/prompts/vision_video.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
video_understanding:
template: >
# Video-Understanding Prompt

## Objective
Produce a text-only reconstruction of the video that allows a reader to experience it without ever pressing play.

---

## Output Structure (Markdown)

### 1. Title
A single-line, evocative summary
Example:
`Neon-Drenched Midnight Ramen: A Slow-Motion Culinary Ode`

---

### 2. Scene-by-Scene Timeline
| Timestamp | Visuals & Camera | Audio & Speech | On-Screen Text / Graphics | Mood |
|-----------|------------------|----------------|---------------------------|------|
| 00:00-00:02 | Fade-in from black; overhead establishing shot of a cramped Tokyo kitchen; warm tungsten glow | Low sizzle of pork fat, subtle city hum outside | White kanji on black: Episode 7 | Anticipatory, intimate |
| 00:02-00:07 | Hand-held camera glides forward; steam curls like incense above a rolling boil | Water bubbles crescendo; no dialogue | None | Focused, almost meditative |
| … | … | … | … | … |

---

### 3. Transcript (Verbatim)
- 00:10-00:15
Male narrator, deep & calm, Japanese with English subtitles:
“In an alley off Shibuya, at 2 a.m., the last customer orders a bowl of tonkotsu.”

---

### 4. Environment & Object Inventory
- Countertop: Stainless steel reflects overhead bulb; soy-sauce bottle labeled Yamasho in red calligraphy; digital timer shows 03:47.
- Ingredients: Chashu cross-section reveals spiral fat marbling; green onions diced to 2 mm cubes.

---

### 5. Style & Technical Notes
- Color palette: High-contrast warm tones—dominant reds & ambers, saturation ≈ 75 %.
- Motion: 60 fps slowed to 30 fps; subtle handheld micro-shakes.
- Soundtrack: Lo-fi hip-hop loop, 72 BPM, sparse piano chords every 8 bars.

input_types: {}
11 changes: 11 additions & 0 deletions py/core/providers/ingestion/r2r/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ class R2RIngestionProvider(IngestionProvider):
DocumentType.HEIC: parsers.ImageParser,
DocumentType.SVG: parsers.ImageParser,
DocumentType.MP3: parsers.AudioParser,
DocumentType.MP4: parsers.VideoParser,
DocumentType.AVI: parsers.VideoParser,
DocumentType.MOV: parsers.VideoParser,
DocumentType.MKV: parsers.VideoParser,
DocumentType.P7S: parsers.P7SParser,
DocumentType.RST: parsers.RSTParser,
DocumentType.RTF: parsers.RTFParser,
Expand Down Expand Up @@ -94,6 +98,13 @@ class R2RIngestionProvider(IngestionProvider):
DocumentType.SVG,
}

VIDEO_TYPES = {
DocumentType.MP4,
DocumentType.AVI,
DocumentType.MOV,
DocumentType.MKV,
}

def __init__(
self,
config: R2RIngestionConfig,
Expand Down
6 changes: 6 additions & 0 deletions py/shared/abstractions/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ class DocumentType(str, Enum):
# Audio
MP3 = "mp3"

# Video
MP4 = "mp4"
AVI = "avi"
MOV = "mov"
MKV = "mkv"

# CSV
CSV = "csv"

Expand Down
103 changes: 103 additions & 0 deletions py/tests/unit/parsers/test_video_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import pytest
import pytest_asyncio
from unittest.mock import AsyncMock, MagicMock

from core.parsers.media.video_parser import VideoParser


class DummyApp:
    """Stand-in for the app-level config; only exposes a vision model name."""

    vlm = "test-vlm"


class DummyConfig:
    """Stand-in ingestion config: no own vlm, so the app-level one is used."""

    vlm = None
    app = DummyApp()


@pytest_asyncio.fixture
def mock_db_provider():
    """Database provider double whose prompts handler returns a fixed prompt."""
    provider = MagicMock()
    cached_prompt = AsyncMock(return_value="prompt text")
    provider.prompts_handler.get_cached_prompt = cached_prompt
    return provider


@pytest_asyncio.fixture
def mock_llm_provider():
    """LLM provider double whose completion holds one choice with fixed text."""
    message = MagicMock(content="video description")
    completion = MagicMock(choices=[MagicMock(message=message)])

    provider = MagicMock()
    provider.aget_completion = AsyncMock(return_value=completion)
    return provider


@pytest.mark.asyncio
async def test_ingest_str_success(mock_db_provider, mock_llm_provider):
    """A URL string input yields the mocked model description."""
    video_parser = VideoParser(
        config=DummyConfig(),
        database_provider=mock_db_provider,
        llm_provider=mock_llm_provider,
    )
    extra_fields = {"file_type": "mp4", "bytes_limit": 120}

    chunks = []
    async for chunk in video_parser.ingest(
        "http://test/video.mp4", extra_fields=extra_fields
    ):
        chunks.append(chunk)

    assert chunks == ["video description"]


@pytest.mark.asyncio
async def test_ingest_bytes_success(mock_db_provider, mock_llm_provider):
    """Raw bytes within the configured limit yield the mocked description."""
    video_parser = VideoParser(
        config=DummyConfig(),
        database_provider=mock_db_provider,
        llm_provider=mock_llm_provider,
    )
    payload = b"1234"
    extra_fields = {"file_type": "mp4", "bytes_limit": 10}

    chunks = []
    async for chunk in video_parser.ingest(payload, extra_fields=extra_fields):
        chunks.append(chunk)

    assert chunks == ["video description"]


@pytest.mark.asyncio
async def test_ingest_invalid_file_type(mock_db_provider, mock_llm_provider):
    """An unsupported file_type is rejected with ValueError."""
    video_parser = VideoParser(
        config=DummyConfig(),
        database_provider=mock_db_provider,
        llm_provider=mock_llm_provider,
    )
    extra_fields = {"file_type": "xyz"}

    with pytest.raises(ValueError):
        async for _ in video_parser.ingest(
            "http://test/video.xyz", extra_fields=extra_fields
        ):
            pass


@pytest.mark.asyncio
async def test_ingest_bytes_limit_exceeded(
    mock_db_provider, mock_llm_provider
):
    """Raw bytes larger than bytes_limit are rejected with ValueError."""
    video_parser = VideoParser(
        config=DummyConfig(),
        database_provider=mock_db_provider,
        llm_provider=mock_llm_provider,
    )
    oversized = b"1" * 11
    extra_fields = {"file_type": "mp4", "bytes_limit": 10}

    with pytest.raises(ValueError):
        async for _ in video_parser.ingest(oversized, extra_fields=extra_fields):
            pass


@pytest.mark.asyncio
async def test_ingest_llm_no_response(mock_db_provider, mock_llm_provider):
    """A completion without choices surfaces as ValueError."""
    empty_completion = MagicMock(choices=[])
    mock_llm_provider.aget_completion = AsyncMock(return_value=empty_completion)

    video_parser = VideoParser(
        config=DummyConfig(),
        database_provider=mock_db_provider,
        llm_provider=mock_llm_provider,
    )
    stream = video_parser.ingest(
        "http://test/video.mp4", extra_fields={"file_type": "mp4"}
    )

    with pytest.raises(ValueError):
        async for _ in stream:
            pass
Loading