Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -1634,6 +1634,7 @@ raw template
{%- endblock in_prompt -%}
"""


exporter_attr = AttrExporter()
output_attr, _ = exporter_attr.from_notebook_node(nb)
assert "raw template" in output_attr
Expand Down
16 changes: 14 additions & 2 deletions nbconvert/exporters/markdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,20 @@ def _raw_mimetypes_default(self):
def default_config(self):
c = Config(
{
"ExtractAttachmentsPreprocessor": {"enabled": True},
"ExtractOutputPreprocessor": {"enabled": True},
"Base64ImageExtractor": {"enabled": True, "use_separate_dir": False},
"ExtractOutputPreprocessor": {
"enabled": True,
"output_filename_template": "{unique_key}_{cell_index}_{index}_{timestamp}{extension}",
},
"ExtractAttachmentsPreprocessor": {"enabled": True, "use_separate_dir": False},
"FilesWriter": {"build_directory": "", "files_dir_template": "{unique_key}_files"},
"Exporter": {
"preprocessors": [
"nbconvert.preprocessors.Base64ImageExtractor",
"nbconvert.preprocessors.ExtractAttachmentsPreprocessor",
"nbconvert.preprocessors.ExtractOutputPreprocessor",
]
},
"NbConvertBase": {
"display_data_priority": [
"text/html",
Expand Down
2 changes: 2 additions & 0 deletions nbconvert/preprocessors/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from .csshtmlheader import CSSHTMLHeaderPreprocessor
from .execute import ExecutePreprocessor
from .extractattachments import ExtractAttachmentsPreprocessor
from .extractbase64images import Base64ImageExtractor
from .extractoutput import ExtractOutputPreprocessor
from .highlightmagics import HighlightMagicsPreprocessor
from .latex import LatexPreprocessor
Expand All @@ -33,4 +34,5 @@
"RegexRemovePreprocessor",
"SVG2PDFPreprocessor",
"TagRemovePreprocessor",
"Base64ImageExtractor", # Add this line
]
99 changes: 99 additions & 0 deletions nbconvert/preprocessors/extractbase64images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""
Preprocessor to extract base64 encoded images from markdown cells and save them to files.
"""

import os
import re
from base64 import b64decode
from typing import Any, Dict, Tuple
from uuid import uuid4

from nbformat.notebooknode import NotebookNode
from traitlets import Bool, Set, Unicode

from nbconvert.preprocessors import Preprocessor


class Base64ImageExtractor(Preprocessor):
    """
    A preprocessor to extract base64-encoded images from Markdown cells and save them as files.

    Every Markdown image whose target is a ``data:image/<type>;base64,<data>``
    URI is decoded, stored in ``resources`` under a generated file path, and
    the Markdown reference is rewritten to point at that path.  Images with an
    unsupported type or undecodable payload are left untouched.
    """

    use_separate_dir = Bool(
        False, help="Whether to use a separate directory for base64 images"
    ).tag(config=True)

    output_directory_template = Unicode(
        "{notebook_name}_files", help="Directory to place base64 images if use_separate_dir is True"
    ).tag(config=True)

    supported_image_types = Set(
        {"png", "jpeg", "jpg", "gif", "bmp", "svg"},
        help="Set of supported image types for extraction",
    ).tag(config=True)

    # Compiled once instead of per cell.  Groups: alt text, image subtype,
    # base64 payload.
    _IMAGE_PATTERN = re.compile(r"!\[([^\]]*)\]\(data:image/([^;]+);base64,([^\)]+)\)")

    def __init__(self, **kw):
        super().__init__(**kw)
        # Directory prefix for generated file paths; set in ``preprocess``.
        self.path_name = ""
        # Key of the ``resources`` sub-dict that receives the decoded images.
        self.resources_item_key = "base64_images"  # default value

    def preprocess(
        self, nb: NotebookNode, resources: Dict[str, Any]
    ) -> Tuple[NotebookNode, Dict[str, Any]]:
        """
        Preprocess the notebook and initialize the output directory for base64 images.

        With ``use_separate_dir`` enabled, images are collected under
        ``resources["base64_images"]`` and the directory name (built from
        ``output_directory_template`` and ``resources["unique_key"]``) is
        published as ``resources["base64_images_dir"]``.  Otherwise images are
        added to ``resources["outputs"]`` under ``resources["output_files_dir"]``
        (``"output"`` when that key is absent).

        Raises
        ------
        TypeError
            If ``resources`` is not a dictionary.
        """
        if not isinstance(resources, dict):
            raise TypeError("Resources must be a dictionary")

        if self.use_separate_dir:
            self.path_name = self.output_directory_template.format(
                notebook_name=resources.get("unique_key", "notebook")
            )
            resources["base64_images_dir"] = self.path_name
            self.resources_item_key = "base64_images"
        else:
            self.path_name = resources.get("output_files_dir", "output")
            self.resources_item_key = "outputs"

        # Make sure the target mapping exists even if no image is found.
        resources.setdefault(self.resources_item_key, {})

        return super().preprocess(nb, resources)

    def _resolve_extension(self, img_type: str) -> str:
        """
        Return the file extension for a data-URI image subtype, or ``""`` if unsupported.

        Matching is case-insensitive.  A structured-syntax suffix is dropped
        when only the base name is supported, so ``svg+xml`` (the subtype
        actually used for SVG data URIs) resolves to ``svg``.
        """
        subtype = img_type.lower()
        if subtype in self.supported_image_types:
            return subtype
        base = subtype.partition("+")[0]
        if base in self.supported_image_types:
            return base
        return ""

    def preprocess_cell(
        self, cell: NotebookNode, resources: Dict[str, Any], index: int
    ) -> Tuple[NotebookNode, Dict[str, Any]]:
        """
        Extract base64 images from Markdown cells and save them to files.

        Non-Markdown cells are returned unchanged.  ``index`` is the cell
        index and becomes part of each generated file name.
        """
        if cell.cell_type != "markdown":
            return cell, resources

        def replace_base64(match: re.Match) -> str:
            alt_text, img_type, b64_data = match.groups()

            extension = self._resolve_extension(img_type)
            if not extension:
                self.log.warning("Unsupported image type: %s", img_type)
                return match.group(0)

            # Only the decode can legitimately fail on bad input;
            # binascii.Error and UnicodeEncodeError are both ValueErrors.
            try:
                img_data = b64decode(b64_data.encode("utf-8"))
            except ValueError as e:
                self.log.error("Failed to decode base64 image: %s", e)
                return match.group(0)

            # A random suffix keeps several images in one cell from colliding.
            filename = f"base64_image_{index}_{uuid4().hex[:8]}.{extension}"
            filepath = os.path.join(self.path_name, filename)

            # Store the bytes under the OS-native path (the original comment
            # says this is picked up by FilesWriter).  ``setdefault`` keeps
            # this working when ``preprocess`` was not run first.
            resources.setdefault(self.resources_item_key, {})[filepath] = img_data

            # Markdown links always use forward slashes.
            link_target = filepath.replace(os.path.sep, "/")
            return f"![{alt_text}]({link_target})"

        cell.source = self._IMAGE_PATTERN.sub(replace_base64, cell.source)
        return cell, resources
9 changes: 8 additions & 1 deletion nbconvert/preprocessors/extractoutput.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,9 @@ def platform_utf_8_encode(data):
return data


import time


class ExtractOutputPreprocessor(Preprocessor):
"""
Extracts all of the outputs from the notebook file. The extracted
Expand Down Expand Up @@ -119,7 +122,11 @@ def preprocess_cell(self, cell, resources, cell_index):
filename += ext
else:
filename = self.output_filename_template.format(
unique_key=unique_key, cell_index=cell_index, index=index, extension=ext
unique_key=unique_key,
cell_index=cell_index,
index=index,
timestamp=str(int(time.time() * 1000)), # Add millisecond timestamp
extension=ext,
)

# On the cell, make the figure available via
Expand Down
43 changes: 43 additions & 0 deletions tests/preprocessors/files/notebook_with_base64.ipynb

Large diffs are not rendered by default.

167 changes: 167 additions & 0 deletions tests/preprocessors/test_extractbase64images.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
"""Tests for the Base64ImageExtractor preprocessor"""

import os
from base64 import b64encode

from nbconvert.preprocessors.extractbase64images import Base64ImageExtractor

from .base import PreprocessorTestsBase


class TestBase64ImageExtractor(PreprocessorTestsBase):
    """Contains test functions for extractbase64images.py"""

    def build_preprocessor(self):
        """Make an instance of a preprocessor"""
        preprocessor = Base64ImageExtractor()
        preprocessor.enabled = True
        return preprocessor

    def test_constructor(self):
        """Can a Base64ImageExtractor be constructed?"""
        self.build_preprocessor()

    def test_base64_extraction(self):
        """Test the extraction of base64 images from markdown cells"""
        # Create a test image
        test_image = b"test_image_data"
        b64_data = b64encode(test_image).decode("utf-8")

        # Create notebook with base64 image
        nb = self.build_notebook()
        nb.cells[1].source = f"![test image](data:image/png;base64,{b64_data})"

        # Setup resources
        res = self.build_resources()
        res["unique_key"] = "test_notebook"

        # Run preprocessor
        preprocessor = self.build_preprocessor()
        nb, res = preprocessor(nb, res)

        # Check if image was extracted
        self.assertEqual(len(res["outputs"]), 1)

        # Get the filename from the markdown
        filename = nb.cells[1].source.split("(")[1].rstrip(")")

        # Verify image data
        self.assertIn(filename, res["outputs"])
        self.assertEqual(res["outputs"][filename], test_image)

        # Verify markdown was updated correctly
        self.assertTrue(nb.cells[1].source.startswith("![test image]"))
        self.assertTrue(nb.cells[1].source.endswith(".png)"))

    def test_invalid_base64(self):
        """Test handling of invalid base64 data"""
        original_source = "![test image](data:image/png;base64,invalid_data)"
        nb = self.build_notebook()
        nb.cells[1].source = original_source

        res = self.build_resources()
        preprocessor = self.build_preprocessor()
        nb, res = preprocessor(nb, res)

        # Should keep original content when base64 is invalid
        self.assertEqual(nb.cells[1].source, original_source)
        self.assertEqual(len(res["outputs"]), 0)

    def test_unsupported_image_type(self):
        """Test handling of unsupported image type"""
        test_image = b"test_image_data"
        b64_data = b64encode(test_image).decode("utf-8")
        original_source = f"![test image](data:image/unsupported;base64,{b64_data})"

        nb = self.build_notebook()
        nb.cells[1].source = original_source

        res = self.build_resources()
        preprocessor = self.build_preprocessor()
        nb, res = preprocessor(nb, res)

        # Should keep original content for unsupported types
        self.assertEqual(nb.cells[1].source, original_source)
        self.assertEqual(len(res["outputs"]), 0)

    def test_multiple_images(self):
        """Test handling of multiple images in one cell"""
        test_image = b"test_image_data"
        b64_data = b64encode(test_image).decode("utf-8")

        nb = self.build_notebook()
        nb.cells[1].source = (
            f"![image1](data:image/png;base64,{b64_data})\n"
            f"![image2](data:image/png;base64,{b64_data})"
        )

        res = self.build_resources()
        preprocessor = self.build_preprocessor()
        nb, res = preprocessor(nb, res)

        # Should extract both images
        self.assertEqual(len(res["outputs"]), 2)
        self.assertTrue(all(".png" in line for line in nb.cells[1].source.split("\n")))

    def test_separate_dir(self):
        """Test extraction with separate directory option"""
        preprocessor = self.build_preprocessor()
        preprocessor.use_separate_dir = True

        test_image = b"test_image_data"
        b64_data = b64encode(test_image).decode("utf-8")

        nb = self.build_notebook()
        nb.cells[1].source = f"![test image](data:image/png;base64,{b64_data})"

        res = self.build_resources()
        res["unique_key"] = "test_notebook"

        nb, res = preprocessor(nb, res)

        # Verify directory structure
        self.assertIn("base64_images_dir", res)
        self.assertEqual(res["base64_images_dir"], "test_notebook_files")

        # Verify image extraction
        self.assertEqual(len(res["base64_images"]), 1)
        filename = nb.cells[1].source.split("(")[1].rstrip(")")
        self.assertTrue(filename.startswith("test_notebook_files/"))

    def test_real_notebook(self):
        """Test extraction with a real notebook containing base64 images"""
        import nbformat

        # Locate the fixture relative to this test module so the test does
        # not depend on the directory pytest is launched from.
        notebook_path = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "files",
            "notebook_with_base64.ipynb",
        )
        with open(notebook_path, encoding="utf-8") as f:
            nb = nbformat.read(f, as_version=4)

        # Setup resources
        res = self.build_resources()
        res["unique_key"] = "notebook_with_base64"

        # Run preprocessor
        preprocessor = self.build_preprocessor()
        preprocessor.use_separate_dir = True
        nb, res = preprocessor(nb, res)

        # Verify images were extracted
        self.assertGreater(len(res["base64_images"]), 0)

        # Verify all extracted files carry an extension of a supported image type
        image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg")
        for filename in res["base64_images"]:
            self.assertTrue(filename.endswith(image_extensions), filename)

        # Verify all image references in markdown cells are updated.
        # Check specifically for base64 image data, not just the word 'base64'.
        data_uri_prefixes = (
            "data:image/png;base64,",
            "data:image/jpeg;base64,",
            "data:image/jpg;base64,",
            "data:image/gif;base64,",
            "data:image/svg;base64,",
        )
        for cell in nb.cells:
            if cell.cell_type == "markdown":
                for prefix in data_uri_prefixes:
                    self.assertNotIn(prefix, cell.source)
Loading