diff --git a/pypdf/_font.py b/pypdf/_font.py index 54e05b7dc..ce9bc7b35 100644 --- a/pypdf/_font.py +++ b/pypdf/_font.py @@ -2,11 +2,12 @@ from dataclasses import dataclass, field from typing import Any, Optional, Union, cast -from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject +from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, StreamObject from ._cmap import get_encoding from ._codecs.adobe_glyphs import adobe_glyphs from ._utils import logger_warning +from .errors import PdfReadError @dataclass(frozen=True) @@ -32,6 +33,7 @@ class FontDescriptor: bbox: tuple[float, float, float, float] = field(default_factory=lambda: (-100.0, -200.0, 1000.0, 900.0)) character_widths: dict[str, int] = field(default_factory=lambda: {"default": 500}) + font_file: Union[StreamObject, None] = None @staticmethod def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: DictionaryObject) -> dict[str, Any]: @@ -59,6 +61,18 @@ def _parse_font_descriptor(font_kwargs: dict[str, Any], font_descriptor_obj: Dic bbox_tuple = tuple(map(float, font_kwargs["bbox"])) assert len(bbox_tuple) == 4, bbox_tuple font_kwargs["bbox"] = bbox_tuple + # Find the binary stream for this font if there is one + for source_key in ["/FontFile", "/FontFile2", "/FontFile3"]: + if source_key in font_descriptor_dict: + if "font_file" in font_kwargs: + raise PdfReadError(f"More than one /FontFile found in {font_descriptor_obj}") + + try: + font_file = font_descriptor_dict[source_key].get_object() + font_kwargs["font_file"] = font_file + except PdfReadError as e: + logger_warning(f"Failed to get '{source_key}' in {font_descriptor_dict}: {e}", __name__) + return font_kwargs @staticmethod diff --git a/tests/test_cmap.py b/tests/test_cmap.py index 76cb46c48..0794a1057 100644 --- a/tests/test_cmap.py +++ b/tests/test_cmap.py @@ -8,7 +8,7 @@ from pypdf._cmap import get_encoding, parse_bfchar from pypdf._codecs import charset_encoding from pypdf._font import Font -from pypdf.generic import ArrayObject, DictionaryObject, IndirectObject, NameObject, NullObject +from pypdf.generic import ArrayObject, DictionaryObject, EncodedStreamObject, IndirectObject, NameObject, NullObject from . import get_data_from_url @@ -139,6 +139,8 @@ def test_iss1533(): reader.pages[0].extract_text() # no error font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F"]) assert font.character_map["\x01"] == "Ü" + assert type(font.font_descriptor.font_file) is EncodedStreamObject + assert font.font_descriptor.font_file["/Subtype"] == "/CIDFontType0C" @pytest.mark.enable_socket diff --git a/tests/test_font.py b/tests/test_font.py index 04bdb8e47..cb1eec5ab 100644 --- a/tests/test_font.py +++ b/tests/test_font.py @@ -1,7 +1,16 @@ """Test font-related functionality.""" +from pathlib import Path -from pypdf._font import FontDescriptor -from pypdf.generic import DictionaryObject, NameObject +import pytest + +from pypdf import PdfReader +from pypdf._font import Font, FontDescriptor +from pypdf.errors import PdfReadError +from pypdf.generic import DictionaryObject, EncodedStreamObject, NameObject + +TESTS_ROOT = Path(__file__).parent.resolve() +PROJECT_ROOT = TESTS_ROOT.parent +RESOURCE_ROOT = PROJECT_ROOT / "resources" def test_font_descriptor(): @@ -28,3 +37,29 @@ def test_font_descriptor(): assert my_font.italic_angle == 0 assert my_font.flags == 33 assert my_font.bbox == (-113.0, -250.0, 749.0, 801.0) + + +def test_font_file(): + reader = PdfReader(RESOURCE_ROOT / "multilang.pdf") + + # /FontFile + font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F2"]) + assert type(font.font_descriptor.font_file) is EncodedStreamObject + assert len(font.font_descriptor.font_file.get_data()) == 5116 + + # /FontFile2 + font_resource = reader.pages[0]["/Resources"]["/Font"]["/F1"] + font = Font.from_font_resource(font_resource) + assert type(font.font_descriptor.font_file) is EncodedStreamObject + assert len(font.font_descriptor.font_file.get_data()) == 28464 + + with pytest.raises(PdfReadError) as exception: + font_resource[NameObject("/FontDescriptor")][NameObject("/FontFile")] = NameObject("xyz") + font = Font.from_font_resource(font_resource) + assert "More than one /FontFile" in exception.value.args[0] + + # /FontFile3 + reader = PdfReader(RESOURCE_ROOT / "attachment.pdf") + font = Font.from_font_resource(reader.pages[0]["/Resources"]["/Font"]["/F1"]) + assert type(font.font_descriptor.font_file) is EncodedStreamObject + assert len(font.font_descriptor.font_file.get_data()) == 2168 diff --git a/tests/test_text_extraction.py b/tests/test_text_extraction.py index f0e5a759b..0f49e22cb 100644 --- a/tests/test_text_extraction.py +++ b/tests/test_text_extraction.py @@ -143,6 +143,7 @@ def test_font_class_to_dict(): "x_height": 500.0, "italic_angle": 0.0, "flags": 32, + "font_file": None, "bbox": ( -100.0, -200.0,