langchain/docs/scripts/notebook_convert.py
Mason Daugherty f624ad489a
feat(docs): improve devx, fix Makefile targets (#32237)
**TL;DR: many of the provided `Makefile` targets were broken, and any
time I wanted to preview changes locally I either had to refer to a
command Chester gave me or wait on a Vercel preview deployment. With
this PR, everything should behave as expected.**

Significant updates to the `Makefile` and documentation files, focusing
on improving usability, adding clear messaging, and fixing/enhancing
documentation workflows.

### Updates to `Makefile`:

#### Enhanced build and cleaning processes:
- Added informative messages (e.g., "📚 Building LangChain
documentation...") to `Makefile` targets like `docs_build`, `docs_clean`,
and `api_docs_build` for better user feedback during execution.
- Introduced a `clean-cache` target to the `docs` `Makefile` to clear
cached dependencies and ensure clean builds.

#### Improved dependency handling:
- Modified `install-py-deps` to create a `.venv/deps_installed` marker,
preventing redundant dependency installations and improving efficiency
(see the sketch below).

#### Streamlined file generation and infrastructure setup:
- Added caching for the LangServe README download and parallelized
feature table generation (a download-caching sketch follows this list).
- Added user-friendly completion messages for targets like `copy-infra`
and `render`.
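
Download caching can be as simple as skipping the fetch when a local
copy already exists; a rough sketch (the URL and cache path are
illustrative, not the PR's exact values):

```python
import urllib.request
from pathlib import Path

README_URL = "https://raw.githubusercontent.com/langchain-ai/langserve/main/README.md"
CACHE_PATH = Path("build/cache/langserve_readme.md")


def fetch_langserve_readme() -> str:
    """Return the README, downloading it only on a cache miss."""
    if not CACHE_PATH.exists():
        CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
        with urllib.request.urlopen(README_URL) as resp:
            CACHE_PATH.write_bytes(resp.read())
    return CACHE_PATH.read_text(encoding="utf-8")
```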

#### Documentation server updates:
- Enhanced the `start` target with messages indicating that the server
has started and the URL for viewing the documentation locally.

---

### Documentation Improvements:

#### Content clarity and consistency:
- Standardized section titles for consistency across documentation
files.
[[1]](diffhunk://#diff-9b1a85ea8a9dcf79f58246c88692cd7a36316665d7e05a69141cfdc50794c82aL1-R1)
[[2]](diffhunk://#diff-944008ad3a79d8a312183618401fcfa71da0e69c75803eff09b779fc8e03183dL1-R1)
- Refined phrasing and formatting in sections like "Dependency
management" and "Formatting and linting" for better readability.
[[1]](diffhunk://#diff-2069d4f956ab606ae6d51b191439283798adaf3a6648542c409d258131617059L6-R6)
[[2]](diffhunk://#diff-2069d4f956ab606ae6d51b191439283798adaf3a6648542c409d258131617059L84-R82)

#### Enhanced workflows:
- Updated instructions for building and viewing documentation locally,
including tips for specifying server ports and handling API reference
previews.
[[1]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L60-R94)
[[2]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L82-R126)
- Expanded guidance on cleaning documentation artifacts and using
linting tools effectively.
[[1]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L82-R126)
[[2]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L107-R142)

#### API reference documentation:
- Improved instructions for generating and formatting in-code
documentation, highlighting best practices for docstring writing (an
illustrative example follows this list).
[[1]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L107-R142)
[[2]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L144-R186)
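
For instance, a Google-style docstring of the general shape the
guidance encourages (the function and its parameters are made up for
illustration):

```python
def load_documents(path: str, *, encoding: str = "utf-8") -> list[str]:
    """Load documents from a file, one per line.

    Args:
        path: Path to the input file.
        encoding: Text encoding used to read the file.

    Returns:
        A list of document strings with trailing newlines stripped.
    """
    with open(path, encoding=encoding) as f:
        return [line.rstrip("\n") for line in f]
```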

---

### Minor Changes:
- Added support for a new package name (`langchain_v1`) in the API
documentation generation script.
- Fixed minor capitalization and formatting issues in documentation
files.
[[1]](diffhunk://#diff-2069d4f956ab606ae6d51b191439283798adaf3a6648542c409d258131617059L40-R40)
[[2]](diffhunk://#diff-2069d4f956ab606ae6d51b191439283798adaf3a6648542c409d258131617059L166-R160)

---------

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-07-25 14:49:03 -04:00


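"""Convert Jupyter notebooks in the intermediate docs tree to Markdown.

Usage (paths are illustrative):

    SOURCE_PATHS="how_to/foo.ipynb" python notebook_convert.py \
        <intermediate_docs_dir> <output_docs_dir>

If ``SOURCE_PATHS`` is unset, every notebook whose Markdown output is
missing or older than the notebook itself is rebuilt.
"""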
import multiprocessing
import os
import re
import sys
from pathlib import Path
from typing import Iterable, Tuple

import nbformat
from nbconvert.exporters import MarkdownExporter
from nbconvert.preprocessors import Preprocessor
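
# Sentinel markers: content wrapped in these HTML comments is hidden when the
# notebook itself is rendered; UnHidePreprocessor strips the markers so the
# content shows up in the converted Markdown.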
HIDE_IN_NB_MAGIC_OPEN = "<!-- HIDE_IN_NB"
HIDE_IN_NB_MAGIC_CLOSE = "HIDE_IN_NB -->"


class EscapePreprocessor(Preprocessor):
def preprocess_cell(self, cell, resources, index):
if cell.cell_type == "markdown":
# rewrite .ipynb links to .md
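            # (the negative lookahead skips external links containing `//`)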
cell.source = re.sub(
r"\[([^\]]*)\]\((?![^\)]*//)([^)]*)\.ipynb\)",
r"[\1](\2.md)",
cell.source,
)
elif cell.cell_type == "code":
# escape ``` in code
cell.source = cell.source.replace("```", r"\`\`\`")
            # allow overriding title based on comment at beginning of cell
if cell.source.startswith("# title="):
lines = cell.source.split("\n")
title = lines[0].split("# title=")[1]
if title.startswith('"') and title.endswith('"'):
title = title[1:-1]
cell.metadata["title"] = title
cell.source = "\n".join(lines[1:])
if "outputs" in cell:
filter_out = set()
for i, output in enumerate(cell["outputs"]):
if "text" in output:
if not output["text"].strip():
filter_out.add(i)
continue
output["text"] = output["text"].replace("```", r"\`\`\`")
elif "data" in output:
for key, value in output["data"].items():
if isinstance(value, str):
output["data"][key] = value.replace("```", r"\`\`\`")
cell["outputs"] = [
output
for i, output in enumerate(cell["outputs"])
if i not in filter_out
]
return cell, resources


class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Extracts attachments from each cell and inlines them into the cell
    source as base64 data URIs.
    """

    def preprocess_cell(self, cell, resources, index):
        """
        Apply a transformation on each cell.

        Parameters
        ----------
        cell : NotebookNode cell
            Notebook cell being processed
        resources : dictionary
            Additional resources used in the conversion process. Allows
            preprocessors to pass variables into the Jinja engine.
        index : int
            Index of the cell being processed
        """
        # Make sure outputs key exists
        if not isinstance(resources.get("outputs"), dict):
            resources["outputs"] = {}
# Loop through all of the attachments in the cell
for name, attach in cell.get("attachments", {}).items():
for mime, data in attach.items():
if mime not in {
"image/png",
"image/jpeg",
"image/svg+xml",
"application/pdf",
}:
continue
# attachments are pre-rendered. Only replace markdown-formatted
# images with the following logic
attach_str = f"({name})"
if attach_str in cell.source:
data = f"(data:{mime};base64,{data})"
cell.source = cell.source.replace(attach_str, data)
return cell, resources


class CustomRegexRemovePreprocessor(Preprocessor):
    def check_conditions(self, cell):
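        # Matches cells that are empty/whitespace-only or contain a
        # `# | output: false` marker anywhere in the source.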
pattern = re.compile(r"(?s)(?:\s*\Z)|(?:.*#\s*\|\s*output:\s*false.*)")
        return not pattern.match(cell.source)

    def preprocess(self, nb, resources):
nb.cells = [cell for cell in nb.cells if self.check_conditions(cell)]
return nb, resources


class UnHidePreprocessor(Preprocessor):
    def preprocess_cell(self, cell, resources, index):
cell.source = cell.source.replace(HIDE_IN_NB_MAGIC_OPEN, "")
cell.source = cell.source.replace(HIDE_IN_NB_MAGIC_CLOSE, "")
return cell, resources


exporter = MarkdownExporter(
preprocessors=[
EscapePreprocessor,
ExtractAttachmentsPreprocessor,
CustomRegexRemovePreprocessor,
UnHidePreprocessor,
],
template_name="mdoutput",
extra_template_basedirs=["./scripts/notebook_convert_templates"],
)


def _process_path(tup: Tuple[Path, Path, Path]):
notebook_path, intermediate_docs_dir, output_docs_dir = tup
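    # Mirror the notebook's relative path under the output tree as a .md file.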
relative = notebook_path.relative_to(intermediate_docs_dir)
output_path = output_docs_dir / relative.parent / (relative.stem + ".md")
_convert_notebook(notebook_path, output_path, intermediate_docs_dir)


def _modify_frontmatter(
    body: str, notebook_path: Path, intermediate_docs_dir: Path
) -> str:
rel_path = notebook_path.relative_to(intermediate_docs_dir).as_posix()
edit_url = (
f"https://github.com/langchain-ai/langchain/edit/master/docs/docs/{rel_path}"
)
frontmatter = {
"custom_edit_url": edit_url,
}
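    # Add each key to existing frontmatter, or create a new frontmatter block.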
if re.match(r"^[\s\n]*---\n", body):
# frontmatter already present
for k, v in frontmatter.items():
# if key already exists, leave it
if re.match(f"{k}: ", body):
continue
else:
body = re.sub(r"^[\s\n]*---\n", f"---\n{k}: {v}\n", body, count=1)
return body
else:
insert = "\n".join([f"{k}: {v}" for k, v in frontmatter.items()])
return f"---\n{insert}\n---\n{body}"


def _convert_notebook(
notebook_path: Path, output_path: Path, intermediate_docs_dir: Path
) -> Path:
import json
import uuid
with open(notebook_path, "r", encoding="utf-8") as f:
nb_json = json.load(f)
# Fix missing and duplicate cell IDs before nbformat validation
seen_ids = set()
for cell in nb_json.get("cells", []):
if "id" not in cell or not cell.get("id") or cell.get("id") in seen_ids:
cell["id"] = str(uuid.uuid4())[:8]
seen_ids.add(cell["id"])
nb = nbformat.reads(json.dumps(nb_json), as_version=4)
# Upgrade notebook format
nb = nbformat.v4.upgrade(nb)
body, resources = exporter.from_notebook_node(nb)
body = _modify_frontmatter(body, notebook_path, intermediate_docs_dir)
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w") as f:
f.write(body)
return output_path
if __name__ == "__main__":
intermediate_docs_dir = Path(sys.argv[1])
output_docs_dir = Path(sys.argv[2])
source_paths_arg = os.environ.get("SOURCE_PATHS")
source_paths: Iterable[Path]
if source_paths_arg:
source_path_strs = re.split(r"\s+", source_paths_arg)
source_paths_stripped = [p.strip() for p in source_path_strs]
source_paths = [intermediate_docs_dir / p for p in source_paths_stripped if p]
else:
original_paths = list(intermediate_docs_dir.glob("**/*.ipynb"))
# exclude files that exist in output directory and are newer
relative_paths = [p.relative_to(intermediate_docs_dir) for p in original_paths]
out_paths = [
output_docs_dir / p.parent / (p.stem + ".md") for p in relative_paths
]
source_paths = [
p
for p, o in zip(original_paths, out_paths)
if not o.exists() or o.stat().st_mtime < p.stat().st_mtime
]
print(f"rebuilding {len(source_paths)}/{len(relative_paths)} notebooks")
with multiprocessing.Pool() as pool:
pool.map(
_process_path,
(
(notebook_path, intermediate_docs_dir, output_docs_dir)
for notebook_path in source_paths
),
)