Skip to content
50 changes: 33 additions & 17 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1576,21 +1576,36 @@ def add_metadata(self, infos: dict[str, Any]) -> None:
self._info = DictionaryObject()
self._info.update(args)

_UNSET = object()

def compress_identical_objects(
self,
remove_identicals_old: Any = _UNSET,
remove_orphans: Any = _UNSET,
*,
remove_identicals: bool = True,
remove_orphans: bool = True,
remove_unreferenced: bool = True,
) -> None:
"""
Parse the PDF file and merge objects that have the same hash.
Parse the PDF file and merge objects that have the same hash.
This will make objects common to multiple pages.
Recommended to be used just before writing output.

Args:
remove_identicals_old: Positional argument, kept while the old positional calling convention for remove_identicals is being deprecated.
remove_orphans: Remove unreferenced objects; deprecated — use remove_unreferenced instead.
remove_identicals: Remove identical objects.
remove_orphans: Remove unreferenced objects.
remove_unreferenced: Remove unreferenced objects.

"""
if remove_identicals_old != self._UNSET:
# Deprecate indicating keyword-only is supported.
assert isinstance(remove_identicals_old, bool) # Check type!
remove_identicals = remove_identicals_old
if remove_orphans != self._UNSET:
# Deprecate with new name and keyword-only.
assert isinstance(remove_orphans, bool) # Check type!
remove_unreferenced = remove_orphans

def replace_in_obj(
obj: PdfObject, crossref: dict[IndirectObject, IndirectObject]
Expand All @@ -1604,17 +1619,17 @@ def replace_in_obj(
assert isinstance(obj, (DictionaryObject, ArrayObject))
for k, v in key_val:
if isinstance(v, IndirectObject):
orphans[v.idnum - 1] = False
unreferenced[v.idnum - 1] = False
if v in crossref:
obj[k] = crossref[v]
else:
"""the filtering on DictionaryObject and ArrayObject only
"""The filtering on DictionaryObject and ArrayObject only
will be performed within replace_in_obj"""
replace_in_obj(v, crossref)

# _idnum_hash :dict[hash]=(1st_ind_obj,[other_indir_objs,...])
# _idnum_hash: dict[hash] = (1st_ind_obj, [2nd_ind_obj,...])
self._idnum_hash = {}
orphans = [True] * len(self._objects)
unreferenced = [True] * len(self._objects)
# look for similar objects
for idx, obj in enumerate(self._objects):
if is_null_or_none(obj):
Expand All @@ -1639,18 +1654,19 @@ def replace_in_obj(
if isinstance(obj, (DictionaryObject, ArrayObject)):
replace_in_obj(obj, cnv_rev)

# remove orphans (if applicable)
orphans[self.root_object.indirect_reference.idnum - 1] = False # type: ignore
if remove_unreferenced:
unreferenced[self.root_object.indirect_reference.idnum - 1] = False # type: ignore

if not is_null_or_none(self._info):
orphans[self._info.indirect_reference.idnum - 1] = False # type: ignore
if not is_null_or_none(self._info):
unreferenced[self._info.indirect_reference.idnum - 1] = False # type: ignore

try:
orphans[self._ID.indirect_reference.idnum - 1] = False # type: ignore
except AttributeError:
pass
for i in compress(range(len(self._objects)), orphans):
self._objects[i] = None
try:
unreferenced[self._ID.indirect_reference.idnum - 1] = False # type: ignore
except AttributeError:
pass

for i in compress(range(len(self._objects)), unreferenced):
self._objects[i] = None

def get_reference(self, obj: PdfObject) -> IndirectObject:
idnum = self._objects.index(obj) + 1
Expand Down
1 change: 1 addition & 0 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ def test_transform_compress_identical_objects():
for page in writer.pages:
op = Transformation().scale(sx=0.8, sy=0.8)
page.add_transformation(op)
writer.add_page(page)
writer.compress_identical_objects()
bytes_out = BytesIO()
writer.write(bytes_out)
Expand Down