Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions libs/community/langchain_community/document_loaders/parsers/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,10 +445,36 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
)
if img_filter in _PDF_FILTER_WITHOUT_LOSS:
height, width = xObject[obj]["/Height"], xObject[obj]["/Width"]
data = xObject[obj].get_data()
bits_per_component = xObject[obj].get(
"/BitsPerComponent", 8
)

np_image = np.frombuffer(
xObject[obj].get_data(), dtype=np.uint8
).reshape(height, width, -1)
try:
if bits_per_component == 1:
# 1-bit monochrome images pack 8 pixels per byte.
# Each row is padded to a byte boundary.
np_data = np.unpackbits(
np.frombuffer(data, dtype=np.uint8)
)
stride = ((width + 7) // 8) * 8
np_data = np_data.reshape(-1, stride)[
:height, :width
]
np_image = (np_data * 255).astype(
np.uint8
).reshape(height, width, 1)
else:
np_image = np.frombuffer(
data, dtype=np.uint8
).reshape(height, width, -1)
except ValueError:
logger.warning(
"Could not reshape lossless PDF image "
f"({height}x{width}, bpc={bits_per_component}, "
f"data={len(data)} bytes), skipping."
)
continue
elif img_filter in _PDF_FILTER_WITH_LOSS:
np_image = np.array(Image.open(io.BytesIO(xObject[obj].get_data())))

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import importlib
from pathlib import Path
from typing import Any, Iterator
from unittest.mock import MagicMock

import numpy as np
import pytest

import langchain_community.document_loaders.parsers as pdf_parsers
Expand Down Expand Up @@ -93,3 +95,128 @@ def test_parsers(
_assert_with_parser(parser, **params)
except ModuleNotFoundError:
pytest.skip(f"{parser_factory} skipped. Require '{require}'")


def _make_mock_xobject_image(
width: int,
height: int,
bits_per_component: int = 8,
filter_name: str = "FlateDecode",
data: bytes | None = None,
) -> MagicMock:
"""Create a mock PDF XObject image with the given properties.

Args:
width: Image width in pixels.
height: Image height in pixels.
bits_per_component: Bits per color component (1 or 8).
filter_name: PDF filter name (e.g. ``FlateDecode``).
data: Raw image bytes. Generated automatically if ``None``.

Returns:
A ``MagicMock`` that behaves like a PDF image XObject.
"""
if data is None:
if bits_per_component == 1:
row_bytes = (width + 7) // 8
data = bytes(row_bytes * height)
else:
data = bytes(width * height)

attrs = {
"/Subtype": "/Image",
"/Filter": f"/{filter_name}",
"/Height": height,
"/Width": width,
"/BitsPerComponent": bits_per_component,
}
img = MagicMock()
img.__getitem__ = lambda self, key: attrs[key]
img.get = lambda key, default=None: attrs.get(key, default)
img.get_data = MagicMock(return_value=data)
return img


class TestPyPDFParser1BitImages:
    """Regression coverage for 1-bit monochrome images in ``PyPDFParser``.

    See https://github.com/langchain-ai/langchain-community/issues/307
    """

    def test_1bit_image_does_not_raise(self) -> None:
        """Extracting a page that holds a packed 1-bit image must not raise."""
        try:
            importlib.import_module("pypdf")
        except ModuleNotFoundError:
            pytest.skip("pypdf not installed")

        from langchain_community.document_loaders.parsers.pdf import PyPDFParser

        width, height = 645, 430
        # Rows are padded to whole bytes: 81 bytes per row, 34,830 in total.
        packed = bytes(((width + 7) // 8) * height)

        # Precondition: the payload is not one byte per pixel, so it cannot
        # be reshaped as an 8-bit image.
        assert len(packed) != width * height

        images = {
            "img0": _make_mock_xobject_image(
                width=width,
                height=height,
                bits_per_component=1,
                filter_name="CCITTFaxDecode",
                data=packed,
            )
        }
        xobjects = MagicMock()
        xobjects.get_object = MagicMock(return_value=images)
        xobjects.__iter__ = lambda self: iter(images)

        resources: dict[str, Any] = {"/XObject": xobjects}
        page = MagicMock()
        # Every key lookup on the page yields the same resources mapping.
        page.__getitem__ = lambda self, key: resources

        # NOTE(review): the parser is built with extract_images=False; confirm
        # that extract_images_from_page still reaches the image-decoding
        # branch in that configuration, otherwise this test passes without
        # exercising the 1-bit path.
        parser = PyPDFParser(extract_images=False)
        assert isinstance(parser.extract_images_from_page(page), str)

    def test_1bit_pixel_values(self) -> None:
        """Unpacked 1-bit samples are scaled to exactly 0 or 255."""
        width, height = 8, 2
        packed = bytes([0b10101010, 0b11110000])

        padded_width = 8 * ((width + 7) // 8)
        bits = np.unpackbits(np.frombuffer(packed, dtype=np.uint8))
        rows = bits.reshape(-1, padded_width)[:height, :width]
        pixels = (rows * 255).astype(np.uint8).reshape(height, width, 1)

        assert pixels.shape == (2, 8, 1)
        # Row 0 is 10101010, row 1 is 11110000 (most significant bit first).
        assert pixels[0, 0, 0] == 255
        assert pixels[0, 1, 0] == 0
        assert pixels[1, 0, 0] == 255
        assert pixels[1, 4, 0] == 0

    def test_1bit_row_padding(self) -> None:
        """Padding bits are dropped when the width is not a multiple of 8."""
        width, height = 10, 1
        # Ten set bits followed by six padding bits.
        packed = bytes([0xFF, 0xC0])

        padded_width = 8 * ((width + 7) // 8)
        bits = np.unpackbits(np.frombuffer(packed, dtype=np.uint8))
        rows = bits.reshape(-1, padded_width)[:height, :width]
        pixels = (rows * 255).astype(np.uint8).reshape(height, width, 1)

        assert pixels.shape == (1, 10, 1)
        assert np.all(pixels[:, :, 0] == 255)

    def test_8bit_image_unaffected(self) -> None:
        """One-byte-per-pixel data still reshapes to (height, width, 1)."""
        width, height = 10, 10
        samples = np.frombuffer(bytes(width * height), dtype=np.uint8)

        pixels = samples.reshape(height, width, -1)

        assert pixels.shape == (10, 10, 1)