import multiprocessing import os import re import sys from pathlib import Path from typing import Iterable, Tuple, List import tqdm import nbformat from nbconvert.exporters import MarkdownExporter from nbconvert.preprocessors import Preprocessor class EscapePreprocessor(Preprocessor): def preprocess_cell(self, cell, resources, cell_index): if cell.cell_type == "markdown": # find all occurrences of ```{=mdx} blocks and remove wrapper if "```{=mdx}\n" in cell.source: cell.source = re.sub( r"```{=mdx}\n(.*?)\n```", r"\1", cell.source, flags=re.DOTALL ) if ":::{.callout" in cell.source: cell.source = re.sub( r":::{.callout-([^}]*)}(.*?):::", r":::\1\2:::", cell.source, flags=re.DOTALL, ) # rewrite .ipynb links to .md cell.source = re.sub( r"\[([^\]]*)\]\((?![^\)]*//)([^)]*)\.ipynb\)", r"[\1](\2.md)", cell.source, ) return cell, resources class ExtractAttachmentsPreprocessor(Preprocessor): """ Extracts all of the outputs from the notebook file. The extracted outputs are returned in the 'resources' dictionary. """ def preprocess_cell(self, cell, resources, cell_index): """ Apply a transformation on each cell, Parameters ---------- cell : NotebookNode cell Notebook cell being processed resources : dictionary Additional resources used in the conversion process. Allows preprocessors to pass variables into the Jinja engine. cell_index : int Index of the cell being processed (see base.py) """ # Get files directory if it has been specified # Make sure outputs key exists if not isinstance(resources["outputs"], dict): resources["outputs"] = {} # Loop through all of the attachments in the cell for name, attach in cell.get("attachments", {}).items(): for mime, data in attach.items(): if mime not in { "image/png", "image/jpeg", "image/svg+xml", "application/pdf", }: continue # attachments are pre-rendered. Only replace markdown-formatted # images with the following logic attach_str = f"({name})" if attach_str in cell.source: data = f"(data:{mime};base64,{data})" cell.source = cell.source.replace(attach_str, data) return cell, resources class CustomRegexRemovePreprocessor(Preprocessor): def check_conditions(self, cell): pattern = re.compile(r"(?s)(?:\s*\Z)|(?:.*#\s*\|\s*output:\s*false.*)") rtn = not pattern.match(cell.source) if not rtn: return False else: return True def preprocess(self, nb, resources): nb.cells = [cell for cell in nb.cells if self.check_conditions(cell)] return nb, resources exporter = MarkdownExporter( preprocessors=[ EscapePreprocessor, ExtractAttachmentsPreprocessor, CustomRegexRemovePreprocessor, ], template_name="mdoutput", extra_template_basedirs=["./scripts/notebook_convert_templates"], ) def _process_path(tup: Tuple[Path, Path, Path]): notebook_path, intermediate_docs_dir, output_docs_dir = tup relative = notebook_path.relative_to(intermediate_docs_dir) output_path = output_docs_dir / relative.parent / (relative.stem + ".md") _convert_notebook(notebook_path, output_path, intermediate_docs_dir) def _modify_frontmatter( body: str, notebook_path: Path, intermediate_docs_dir: Path ) -> str: # if frontmatter exists rel_path = notebook_path.relative_to(intermediate_docs_dir).as_posix() edit_url = ( f"https://github.com/langchain-ai/langchain/edit/master/docs/docs/{rel_path}" ) if re.match(r"^[\s\n]*---\n", body): # if custom_edit_url already exists, leave it if re.match(r"custom_edit_url: ", body): return body else: return re.sub( r"^[\s\n]*---\n", f"---\ncustom_edit_url: {edit_url}\n", body, count=1 ) else: return f"---\ncustom_edit_url: {edit_url}\n---\n{body}" def _convert_notebook( notebook_path: Path, output_path: Path, intermediate_docs_dir: Path ) -> Path: with open(notebook_path) as f: nb = nbformat.read(f, as_version=4) body, resources = exporter.from_notebook_node(nb) body = _modify_frontmatter(body, notebook_path, intermediate_docs_dir) output_path.parent.mkdir(parents=True, exist_ok=True) with open(output_path, "w") as f: f.write(body) return output_path if __name__ == "__main__": intermediate_docs_dir = Path(sys.argv[1]) output_docs_dir = Path(sys.argv[2]) source_paths_arg = os.environ.get("SOURCE_PATHS") source_paths: List[Path] if source_paths_arg: source_path_strs = re.split(r"\s+", source_paths_arg) source_paths_stripped = [p.strip() for p in source_path_strs] source_paths = [intermediate_docs_dir / p for p in source_paths_stripped if p] else: source_paths = list(intermediate_docs_dir.glob("**/*.ipynb")) with multiprocessing.Pool() as pool: list( tqdm.tqdm( pool.imap( _process_path, ( (notebook_path, intermediate_docs_dir, output_docs_dir) for notebook_path in source_paths ), ), total=len(source_paths), ) )