infra: use nbconvert for docs build (#21135)

todo - [x] remove quarto build semantics - [x] remove quarto download/install - [x] make `uv` not verbose
2025-09-05 04:55:14 +00:00 · 2024-05-07 12:30:17 -07:00
parent ad0f3c14c2
commit d5bde4fa91
14 changed files with 262 additions and 216 deletions
--- a/docs/scripts/generate_api_reference_links.py
+++ b/docs/scripts/generate_api_reference_links.py
@@ -185,8 +185,8 @@ def replace_imports(file):
    # Use re.sub to replace each Python code block
    data = code_block_re.sub(replacer, data)

-    if all_imports:
-        print(f"Adding {len(all_imports)} links for imports in {file}")  # noqa: T201
+    # if all_imports:
+    #     print(f"Adding {len(all_imports)} links for imports in {file}")  # noqa: T201
    with open(file, "w") as f:
        f.write(data)
    return all_imports
--- a/docs/scripts/notebook_convert.py
+++ b/docs/scripts/notebook_convert.py
@@ -0,0 +1,130 @@
+import multiprocessing
+import os
+import re
+import sys
+from pathlib import Path
+from typing import Iterable, Tuple
+
+import nbformat
+from nbconvert.exporters import MarkdownExporter
+from nbconvert.preprocessors import Preprocessor, RegexRemovePreprocessor
+
+
+class EscapePreprocessor(Preprocessor):
+    def preprocess_cell(self, cell, resources, cell_index):
+        if cell.cell_type == "markdown":
+            # find all occurrences of ```{=mdx} blocks and remove wrapper
+            if "```{=mdx}\n" in cell.source:
+                cell.source = re.sub(
+                    r"```{=mdx}\n(.*?)\n```", r"\1", cell.source, flags=re.DOTALL
+                )
+            if ":::{.callout" in cell.source:
+                cell.source = re.sub(
+                    r":::{.callout-([^}]*)}(.*?):::",
+                    r":::\1\2:::",
+                    cell.source,
+                    flags=re.DOTALL,
+                )
+        return cell, resources
+
+
+class ExtractAttachmentsPreprocessor(Preprocessor):
+    """
+    Extracts all of the outputs from the notebook file.  The extracted
+    outputs are returned in the 'resources' dictionary.
+    """
+
+    def preprocess_cell(self, cell, resources, cell_index):
+        """
+        Apply a transformation on each cell,
+        Parameters
+        ----------
+        cell : NotebookNode cell
+            Notebook cell being processed
+        resources : dictionary
+            Additional resources used in the conversion process.  Allows
+            preprocessors to pass variables into the Jinja engine.
+        cell_index : int
+            Index of the cell being processed (see base.py)
+        """
+
+        # Get files directory if it has been specified
+
+        # Make sure outputs key exists
+        if not isinstance(resources["outputs"], dict):
+            resources["outputs"] = {}
+
+        # Loop through all of the attachments in the cell
+        for name, attach in cell.get("attachments", {}).items():
+            for mime, data in attach.items():
+                if mime not in {
+                    "image/png",
+                    "image/jpeg",
+                    "image/svg+xml",
+                    "application/pdf",
+                }:
+                    continue
+
+                # attachments are pre-rendered. Only replace markdown-formatted
+                # images with the following logic
+                attach_str = f"({name})"
+                if attach_str in cell.source:
+                    data = f"(data:{mime};base64,{data})"
+                    cell.source = cell.source.replace(attach_str, data)
+
+        return cell, resources
+
+
+exporter = MarkdownExporter(
+    preprocessors=[
+        EscapePreprocessor,
+        ExtractAttachmentsPreprocessor,
+        RegexRemovePreprocessor(patterns=[r"^\s*$"]),
+    ],
+    template_name="mdoutput",
+    extra_template_basedirs=["./scripts/notebook_convert_templates"],
+)
+
+
+def _process_path(tup: Tuple[Path, Path, Path]):
+    notebook_path, intermediate_docs_dir, output_docs_dir = tup
+    relative = notebook_path.relative_to(intermediate_docs_dir)
+    output_path = output_docs_dir / relative.parent / (relative.stem + ".md")
+    _convert_notebook(notebook_path, output_path)
+
+
+def _convert_notebook(notebook_path: Path, output_path: Path):
+    with open(notebook_path) as f:
+        nb = nbformat.read(f, as_version=4)
+
+    body, resources = exporter.from_notebook_node(nb)
+
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    with open(output_path, "w") as f:
+        f.write(body)
+
+    return output_path
+
+
+if __name__ == "__main__":
+    intermediate_docs_dir = Path(sys.argv[1])
+    output_docs_dir = Path(sys.argv[2])
+
+    source_paths_arg = os.environ.get("SOURCE_PATHS")
+    source_paths: Iterable[Path]
+    if source_paths_arg:
+        source_path_strs = re.split(r"\s+", source_paths_arg)
+        source_paths_stripped = [p.strip() for p in source_path_strs]
+        source_paths = [intermediate_docs_dir / p for p in source_paths_stripped if p]
+    else:
+        source_paths = intermediate_docs_dir.glob("**/*.ipynb")
+
+    with multiprocessing.Pool() as pool:
+        pool.map(
+            _process_path,
+            (
+                (notebook_path, intermediate_docs_dir, output_docs_dir)
+                for notebook_path in source_paths
+            ),
+        )
--- a/docs/scripts/notebook_convert_templates/mdoutput/conf.json
+++ b/docs/scripts/notebook_convert_templates/mdoutput/conf.json
@@ -0,0 +1,5 @@
+{
+  "mimetypes": {
+    "text/markdown": true
+  }
+}
--- a/docs/scripts/notebook_convert_templates/mdoutput/index.md.j2
+++ b/docs/scripts/notebook_convert_templates/mdoutput/index.md.j2
@@ -0,0 +1,33 @@
+{% extends 'markdown/index.md.j2' %}
+
+{%- block traceback_line -%}
+```output
+{{ line.rstrip() | strip_ansi }}
+```
+{%- endblock traceback_line -%}
+
+{%- block stream -%}
+```output
+{{ output.text.rstrip() }}
+```
+{%- endblock stream -%}
+
+{%- block data_text scoped -%}
+```output
+{{ output.data['text/plain'].rstrip() }}
+```
+{%- endblock data_text -%}
+
+{%- block data_html scoped -%}
+```html
+{{ output.data['text/html'] | safe }} 
+```
+{%- endblock data_html -%}
+
+{%- block data_jpg scoped -%}
+![](data:image/jpg;base64,{{ output.data['image/jpeg'] }})
+{%- endblock data_jpg -%}
+
+{%- block data_png scoped -%}
+![](data:image/png;base64,{{ output.data['image/png'] }})
+{%- endblock data_png -%}