jupyter · untko · Feb 14, 2025 · Feb 14, 2025 · Feb 14, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1634,6 +1634,7 @@ raw template
 {%- endblock in_prompt -%}
     """
 
+
 exporter_attr = AttrExporter()
 output_attr, _ = exporter_attr.from_notebook_node(nb)
 assert "raw template" in output_attr

diff --git a/nbconvert/exporters/markdown.py b/nbconvert/exporters/markdown.py
@@ -34,8 +34,20 @@ def _raw_mimetypes_default(self):
     def default_config(self):
         c = Config(
             {
-                "ExtractAttachmentsPreprocessor": {"enabled": True},
-                "ExtractOutputPreprocessor": {"enabled": True},
+                "Base64ImageExtractor": {"enabled": True, "use_separate_dir": False},
+                "ExtractOutputPreprocessor": {
+                    "enabled": True,
+                    "output_filename_template": "{unique_key}_{cell_index}_{index}_{timestamp}{extension}",
+                },
+                "ExtractAttachmentsPreprocessor": {"enabled": True, "use_separate_dir": False},
+                "FilesWriter": {"build_directory": "", "files_dir_template": "{unique_key}_files"},
+                "Exporter": {
+                    "preprocessors": [
+                        "nbconvert.preprocessors.Base64ImageExtractor",
+                        "nbconvert.preprocessors.ExtractAttachmentsPreprocessor",
+                        "nbconvert.preprocessors.ExtractOutputPreprocessor",
+                    ]
+                },
                 "NbConvertBase": {
                     "display_data_priority": [
                         "text/html",

diff --git a/nbconvert/preprocessors/__init__.py b/nbconvert/preprocessors/__init__.py
@@ -10,6 +10,7 @@
 from .csshtmlheader import CSSHTMLHeaderPreprocessor
 from .execute import ExecutePreprocessor
 from .extractattachments import ExtractAttachmentsPreprocessor
+from .extractbase64images import Base64ImageExtractor
 from .extractoutput import ExtractOutputPreprocessor
 from .highlightmagics import HighlightMagicsPreprocessor
 from .latex import LatexPreprocessor
@@ -33,4 +34,5 @@
     "RegexRemovePreprocessor",
     "SVG2PDFPreprocessor",
     "TagRemovePreprocessor",
+    "Base64ImageExtractor",
 ]
diff --git a/nbconvert/preprocessors/extractbase64images.py b/nbconvert/preprocessors/extractbase64images.py
@@ -0,0 +1,126 @@
+"""
+Preprocessor to extract base64 encoded images from markdown cells and save them to files.
+"""
+
+import binascii
+import os
+import re
+from base64 import b64decode
+from typing import Any, Dict, Tuple
+from uuid import uuid4
+
+from nbformat.notebooknode import NotebookNode
+from traitlets import Bool, Set, Unicode
+
+from .base import Preprocessor
+
+# Inline Markdown image whose target is a base64 ``data:image/...`` URI.
+# Groups: (1) alt text, (2) MIME subtype, e.g. ``png`` or ``svg+xml``, (3) payload.
+_BASE64_IMAGE_PATTERN = re.compile(r"!\[([^\]]*)\]\(data:image/([^;]+);base64,([^\)]+)\)")
+
+
+class Base64ImageExtractor(Preprocessor):
+    """
+    A preprocessor to extract base64-encoded images from Markdown cells and save them as files.
+
+    Each ``![alt](data:image/<type>;base64,<payload>)`` reference in a Markdown cell is
+    decoded, recorded in ``resources`` under a generated file name, and rewritten to point
+    at that file. References with an unsupported type or an undecodable payload are left
+    unchanged.
+    """
+
+    use_separate_dir = Bool(
+        False, help="Whether to use a separate directory for base64 images"
+    ).tag(config=True)
+
+    output_directory_template = Unicode(
+        "{notebook_name}_files", help="Directory to place base64 images if use_separate_dir is True"
+    ).tag(config=True)
+
+    supported_image_types = Set(
+        {"png", "jpeg", "jpg", "gif", "bmp", "svg"},
+        help="Set of supported image types for extraction",
+    ).tag(config=True)
+
+    def __init__(self, **kw):
+        """Set defaults that remain usable even if ``preprocess`` is never called."""
+        super().__init__(**kw)
+        self.path_name = ""
+        self.resources_item_key = "base64_images"  # default value
+
+    def preprocess(
+        self, nb: NotebookNode, resources: Dict[str, Any]
+    ) -> Tuple[NotebookNode, Dict[str, Any]]:
+        """
+        Choose where extracted images are recorded, then process every cell.
+
+        With ``use_separate_dir`` enabled, images are keyed under
+        ``resources["base64_images"]`` inside the directory rendered from
+        ``output_directory_template`` (also published as ``resources["base64_images_dir"]``).
+        Otherwise they are added to ``resources["outputs"]`` inside
+        ``resources["output_files_dir"]``, falling back to ``"output"`` when that is unset.
+
+        Raises ``TypeError`` if ``resources`` is not a dictionary.
+
+        NOTE(review): nothing in this module writes ``resources["base64_images"]`` to disk;
+        confirm the writer in use consumes that key before enabling ``use_separate_dir``.
+        """
+        if not isinstance(resources, dict):
+            raise TypeError("Resources must be a dictionary")
+
+        if self.use_separate_dir:
+            self.path_name = self.output_directory_template.format(
+                notebook_name=resources.get("unique_key", "notebook")
+            )
+            resources["base64_images_dir"] = self.path_name
+            self.resources_item_key = "base64_images"
+        else:
+            # A key that is present but None must not reach os.path.join.
+            output_dir = resources.get("output_files_dir")
+            self.path_name = "output" if output_dir is None else output_dir
+            self.resources_item_key = "outputs"
+
+        resources.setdefault(self.resources_item_key, {})
+
+        return super().preprocess(nb, resources)
+
+    def preprocess_cell(
+        self, cell: NotebookNode, resources: Dict[str, Any], index: int
+    ) -> Tuple[NotebookNode, Dict[str, Any]]:
+        """
+        Extract base64 images from Markdown cells and save them to files.
+
+        Non-Markdown cells are returned untouched. ``index`` is embedded in the generated
+        file names together with a random suffix so that repeated images stay distinct.
+        """
+        if cell.cell_type != "markdown":
+            return cell, resources
+
+        def replace_base64(match: re.Match) -> str:
+            alt_text, img_type, b64_data = match.groups()
+            # SVG data URIs use the ``svg+xml`` subtype; drop the ``+xml`` suffix so they
+            # map onto the ``svg`` entry and a usable file extension.
+            extension = img_type.lower().split("+", 1)[0]
+            if extension not in self.supported_image_types:
+                self.log.warning("Unsupported image type: %s", img_type)
+                return match.group(0)
+
+            try:
+                # Tolerate wrapped payloads, but reject any other stray character instead
+                # of letting b64decode silently drop it and produce a corrupt image.
+                img_data = b64decode("".join(b64_data.split()), validate=True)
+            except (binascii.Error, ValueError) as e:
+                self.log.error("Failed to decode base64 image: %s", e)
+                return match.group(0)
+
+            filename = f"base64_image_{index}_{uuid4().hex[:8]}.{extension}"
+            filepath = os.path.join(self.path_name, filename)
+
+            # Store for FilesWriter
+            resources.setdefault(self.resources_item_key, {})[filepath] = img_data
+
+            # Markdown links always use forward slashes, whatever the host OS.
+            return f"![{alt_text}]({filepath.replace(os.path.sep, '/')})"
+
+        cell.source = _BASE64_IMAGE_PATTERN.sub(replace_base64, cell.source)
+        return cell, resources
diff --git a/nbconvert/preprocessors/extractoutput.py b/nbconvert/preprocessors/extractoutput.py
@@ -39,6 +39,9 @@ def platform_utf_8_encode(data):
     return data
 
 
+import time
+
+
 class ExtractOutputPreprocessor(Preprocessor):
     """
     Extracts all of the outputs from the notebook file.  The extracted
@@ -119,7 +122,11 @@ def preprocess_cell(self, cell, resources, cell_index):
                             filename += ext
                     else:
                         filename = self.output_filename_template.format(
-                            unique_key=unique_key, cell_index=cell_index, index=index, extension=ext
+                            unique_key=unique_key,
+                            cell_index=cell_index,
+                            index=index,
+                            timestamp=str(int(time.time() * 1000)),  # Add millisecond timestamp
+                            extension=ext,
                         )
 
                     # On the cell, make the figure available via

diff --git a/tests/preprocessors/files/notebook_with_base64.ipynb b/tests/preprocessors/files/notebook_with_base64.ipynb
diff --git a/tests/preprocessors/test_extractbase64images.py b/tests/preprocessors/test_extractbase64images.py
@@ -0,0 +1,169 @@
+"""Tests for the Base64ImageExtractor preprocessor"""
+
+import os
+from base64 import b64encode
+
+import nbformat
+
+from nbconvert.preprocessors.extractbase64images import Base64ImageExtractor
+
+from .base import PreprocessorTestsBase
+
+# Fixtures are resolved relative to this file so the tests pass from any working directory.
+FILES_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "files")
+
+
+class TestBase64ImageExtractor(PreprocessorTestsBase):
+    """Contains test functions for extractbase64images.py"""
+
+    def build_preprocessor(self):
+        """Make an instance of a preprocessor"""
+        preprocessor = Base64ImageExtractor()
+        preprocessor.enabled = True
+        return preprocessor
+
+    def test_constructor(self):
+        """Can a Base64ImageExtractor be constructed?"""
+        self.build_preprocessor()
+
+    def test_base64_extraction(self):
+        """Test the extraction of base64 images from markdown cells"""
+        # Create a test image
+        test_image = b"test_image_data"
+        b64_data = b64encode(test_image).decode("utf-8")
+
+        # Create notebook with base64 image
+        nb = self.build_notebook()
+        nb.cells[1].source = f"![test image](data:image/png;base64,{b64_data})"
+
+        # Setup resources
+        res = self.build_resources()
+        res["unique_key"] = "test_notebook"
+
+        # Run preprocessor
+        preprocessor = self.build_preprocessor()
+        nb, res = preprocessor(nb, res)
+
+        # Check if image was extracted
+        self.assertEqual(len(res["outputs"]), 1)
+
+        # Get the filename from the markdown
+        filename = nb.cells[1].source.split("(")[1].rstrip(")")
+
+        # Resource keys use the OS separator while markdown links always use "/".
+        stored = {key.replace(os.path.sep, "/"): data for key, data in res["outputs"].items()}
+
+        # Verify image data
+        self.assertIn(filename, stored)
+        self.assertEqual(stored[filename], test_image)
+
+        # Verify markdown was updated correctly
+        self.assertTrue(nb.cells[1].source.startswith("![test image]"))
+        self.assertTrue(nb.cells[1].source.endswith(".png)"))
+
+    def test_invalid_base64(self):
+        """Test handling of invalid base64 data"""
+        nb = self.build_notebook()
+        nb.cells[1].source = "![test image](data:image/png;base64,invalid_data)"
+
+        res = self.build_resources()
+        preprocessor = self.build_preprocessor()
+        nb, res = preprocessor(nb, res)
+
+        # Should keep original content when base64 is invalid
+        self.assertEqual(nb.cells[1].source, "![test image](data:image/png;base64,invalid_data)")
+        self.assertEqual(len(res["outputs"]), 0)
+
+    def test_unsupported_image_type(self):
+        """Test handling of unsupported image type"""
+        test_image = b"test_image_data"
+        b64_data = b64encode(test_image).decode("utf-8")
+
+        nb = self.build_notebook()
+        nb.cells[1].source = f"![test image](data:image/unsupported;base64,{b64_data})"
+
+        res = self.build_resources()
+        preprocessor = self.build_preprocessor()
+        nb, res = preprocessor(nb, res)
+
+        # Should keep original content for unsupported types
+        self.assertEqual(
+            nb.cells[1].source, f"![test image](data:image/unsupported;base64,{b64_data})"
+        )
+        self.assertEqual(len(res["outputs"]), 0)
+
+    def test_multiple_images(self):
+        """Test handling of multiple images in one cell"""
+        test_image = b"test_image_data"
+        b64_data = b64encode(test_image).decode("utf-8")
+
+        nb = self.build_notebook()
+        nb.cells[1].source = (
+            f"![image1](data:image/png;base64,{b64_data})\n"
+            f"![image2](data:image/png;base64,{b64_data})"
+        )
+
+        res = self.build_resources()
+        preprocessor = self.build_preprocessor()
+        nb, res = preprocessor(nb, res)
+
+        # Should extract both images
+        self.assertEqual(len(res["outputs"]), 2)
+        self.assertTrue(all(".png" in line for line in nb.cells[1].source.split("\n")))
+
+    def test_separate_dir(self):
+        """Test extraction with separate directory option"""
+        preprocessor = self.build_preprocessor()
+        preprocessor.use_separate_dir = True
+
+        test_image = b"test_image_data"
+        b64_data = b64encode(test_image).decode("utf-8")
+
+        nb = self.build_notebook()
+        nb.cells[1].source = f"![test image](data:image/png;base64,{b64_data})"
+
+        res = self.build_resources()
+        res["unique_key"] = "test_notebook"
+
+        nb, res = preprocessor(nb, res)
+
+        # Verify directory structure
+        self.assertIn("base64_images_dir", res)
+        self.assertEqual(res["base64_images_dir"], "test_notebook_files")
+
+        # Verify image extraction
+        self.assertEqual(len(res["base64_images"]), 1)
+        filename = nb.cells[1].source.split("(")[1].rstrip(")")
+        self.assertTrue(filename.startswith("test_notebook_files/"))
+
+    def test_real_notebook(self):
+        """Test extraction with a real notebook containing base64 images"""
+        # Load the test notebook
+        notebook_path = os.path.join(FILES_DIR, "notebook_with_base64.ipynb")
+        with open(notebook_path, encoding="utf-8") as f:
+            nb = nbformat.read(f, as_version=4)
+
+        # Setup resources
+        res = self.build_resources()
+        res["unique_key"] = "notebook_with_base64"
+
+        # Run preprocessor
+        preprocessor = self.build_preprocessor()
+        preprocessor.use_separate_dir = True
+        nb, res = preprocessor(nb, res)
+
+        # Verify images were extracted
+        self.assertGreater(len(res["base64_images"]), 0)
+
+        # Verify all extracted files are valid image files
+        image_extensions = (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg")
+        for filename in res["base64_images"]:
+            self.assertTrue(filename.endswith(image_extensions), filename)
+
+        # Verify all image references in markdown cells are updated
+        for cell in nb.cells:
+            if cell.cell_type != "markdown":
+                continue
+            # Check specifically for base64 image data, not just the word 'base64'
+            for image_type in ("png", "jpeg", "jpg", "gif", "svg"):
+                self.assertNotIn(f"data:image/{image_type};base64,", cell.source)