langchain/docs/scripts/notebook_convert.py

import multiprocessing
import os
import re
import sys
from pathlib import Path
from typing import Iterable, Tuple

import nbformat
from nbconvert.exporters import MarkdownExporter
from nbconvert.preprocessors import Preprocessor

HIDE_IN_NB_MAGIC_OPEN = "<!-- HIDE_IN_NB"
HIDE_IN_NB_MAGIC_CLOSE = "HIDE_IN_NB -->"


class EscapePreprocessor(Preprocessor):
    def preprocess_cell(self, cell, resources, index):
        if cell.cell_type == "markdown":
            # rewrite .ipynb links to .md
            cell.source = re.sub(
                r"\[([^\]]*)\]\((?![^\)]*//)([^)]*)\.ipynb\)",
                r"[\1](\2.md)",
                cell.source,
            )

        elif cell.cell_type == "code":
            # escape ``` in code
            cell.source = cell.source.replace("```", r"\`\`\`")
            # escape ``` in output

            # allow overriding title based on comment at beginning of cell
            if cell.source.startswith("# title="):
                lines = cell.source.split("\n")
                title = lines[0].split("# title=")[1]
                if title.startswith('"') and title.endswith('"'):
                    title = title[1:-1]
                cell.metadata["title"] = title
                cell.source = "\n".join(lines[1:])

            if "outputs" in cell:
                filter_out = set()
                for i, output in enumerate(cell["outputs"]):
                    if "text" in output:
                        if not output["text"].strip():
                            filter_out.add(i)
                            continue
                        output["text"] = output["text"].replace("```", r"\`\`\`")
                    elif "data" in output:
                        for key, value in output["data"].items():
                            if isinstance(value, str):
                                output["data"][key] = value.replace("```", r"\`\`\`")
                cell["outputs"] = [
                    output
                    for i, output in enumerate(cell["outputs"])
                    if i not in filter_out
                ]

        return cell, resources


class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Extracts all of the outputs from the notebook file.  The extracted
    outputs are returned in the 'resources' dictionary.
    """

    def preprocess_cell(self, cell, resources, index):
        """
        Apply a transformation on each cell,
        Parameters
        ----------
        cell : NotebookNode cell
            Notebook cell being processed
        resources : dictionary
            Additional resources used in the conversion process.  Allows
            preprocessors to pass variables into the Jinja engine.
        cell_index : int
            Index of the cell being processed (see base.py)
        """

        # Get files directory if it has been specified

        # Make sure outputs key exists
        if not isinstance(resources["outputs"], dict):
            resources["outputs"] = {}

        # Loop through all of the attachments in the cell
        for name, attach in cell.get("attachments", {}).items():
            for mime, data in attach.items():
                if mime not in {
                    "image/png",
                    "image/jpeg",
                    "image/svg+xml",
                    "application/pdf",
                }:
                    continue

                # attachments are pre-rendered. Only replace markdown-formatted
                # images with the following logic
                attach_str = f"({name})"
                if attach_str in cell.source:
                    data = f"(data:{mime};base64,{data})"
                    cell.source = cell.source.replace(attach_str, data)

        return cell, resources


class CustomRegexRemovePreprocessor(Preprocessor):
    def check_conditions(self, cell):
        pattern = re.compile(r"(?s)(?:\s*\Z)|(?:.*#\s*\|\s*output:\s*false.*)")
        rtn = not pattern.match(cell.source)
        if not rtn:
            return False
        else:
            return True

    def preprocess(self, nb, resources):
        nb.cells = [cell for cell in nb.cells if self.check_conditions(cell)]

        return nb, resources


class UnHidePreprocessor(Preprocessor):
    def preprocess_cell(self, cell, resources, index):
        cell.source = cell.source.replace(HIDE_IN_NB_MAGIC_OPEN, "")
        cell.source = cell.source.replace(HIDE_IN_NB_MAGIC_CLOSE, "")
        return cell, resources


exporter = MarkdownExporter(
    preprocessors=[
        EscapePreprocessor,
        ExtractAttachmentsPreprocessor,
        CustomRegexRemovePreprocessor,
        UnHidePreprocessor,
    ],
    template_name="mdoutput",
    extra_template_basedirs=["./scripts/notebook_convert_templates"],
)


def _process_path(tup: Tuple[Path, Path, Path]):
    notebook_path, intermediate_docs_dir, output_docs_dir = tup
    relative = notebook_path.relative_to(intermediate_docs_dir)
    output_path = output_docs_dir / relative.parent / (relative.stem + ".md")
    _convert_notebook(notebook_path, output_path, intermediate_docs_dir)


def _modify_frontmatter(
    body: str, notebook_path: Path, intermediate_docs_dir: Path
) -> str:
    # if frontmatter exists
    rel_path = notebook_path.relative_to(intermediate_docs_dir).as_posix()
    edit_url = (
        f"https://github.com/langchain-ai/langchain/edit/master/docs/docs/{rel_path}"
    )
    frontmatter = {
        "custom_edit_url": edit_url,
    }
    if re.match(r"^[\s\n]*---\n", body):
        # frontmatter already present

        for k, v in frontmatter.items():
            # if key already exists, leave it
            if re.match(f"{k}: ", body):
                continue
            else:
                body = re.sub(r"^[\s\n]*---\n", f"---\n{k}: {v}\n", body, count=1)
        return body
    else:
        insert = "\n".join([f"{k}: {v}" for k, v in frontmatter.items()])
        return f"---\n{insert}\n---\n{body}"


def _convert_notebook(
    notebook_path: Path, output_path: Path, intermediate_docs_dir: Path
) -> Path:
    import json
    import uuid

    with open(notebook_path, "r", encoding="utf-8") as f:
        nb_json = json.load(f)

    # Fix missing and duplicate cell IDs before nbformat validation
    seen_ids = set()
    for cell in nb_json.get("cells", []):
        if "id" not in cell or not cell.get("id") or cell.get("id") in seen_ids:
            cell["id"] = str(uuid.uuid4())[:8]
        seen_ids.add(cell["id"])

    nb = nbformat.reads(json.dumps(nb_json), as_version=4)

    # Upgrade notebook format
    nb = nbformat.v4.upgrade(nb)

    body, resources = exporter.from_notebook_node(nb)

    body = _modify_frontmatter(body, notebook_path, intermediate_docs_dir)

    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w") as f:
        f.write(body)

    return output_path


if __name__ == "__main__":
    intermediate_docs_dir = Path(sys.argv[1])
    output_docs_dir = Path(sys.argv[2])

    source_paths_arg = os.environ.get("SOURCE_PATHS")
    source_paths: Iterable[Path]
    if source_paths_arg:
        source_path_strs = re.split(r"\s+", source_paths_arg)
        source_paths_stripped = [p.strip() for p in source_path_strs]
        source_paths = [intermediate_docs_dir / p for p in source_paths_stripped if p]
    else:
        original_paths = list(intermediate_docs_dir.glob("**/*.ipynb"))
        # exclude files that exist in output directory and are newer
        relative_paths = [p.relative_to(intermediate_docs_dir) for p in original_paths]
        out_paths = [
            output_docs_dir / p.parent / (p.stem + ".md") for p in relative_paths
        ]
        source_paths = [
            p
            for p, o in zip(original_paths, out_paths)
            if not o.exists() or o.stat().st_mtime < p.stat().st_mtime
        ]
        print(f"rebuilding {len(source_paths)}/{len(relative_paths)} notebooks")

    with multiprocessing.Pool() as pool:
        pool.map(
            _process_path,
            (
                (notebook_path, intermediate_docs_dir, output_docs_dir)
                for notebook_path in source_paths
            ),
        )