**TL;DR: many of the provided `Makefile` targets were broken, and any
time I wanted to preview changes locally I had to either fall back on a
command Chester gave me or wait on a Vercel preview deployment. With
this PR, everything should behave as expected.**
Significant updates to the `Makefile` and documentation files, focusing
on improving usability, adding clear messaging, and fixing/enhancing
documentation workflows.
### Updates to `Makefile`:
#### Enhanced build and cleaning processes:
- Added informative messages (e.g., "📚 Building LangChain
documentation...") to makefile targets like `docs_build`, `docs_clean`,
and `api_docs_build` for better user feedback during execution.
- Introduced a `clean-cache` target to the `docs` `Makefile` to clear
cached dependencies and ensure clean builds (sketched below).
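
A rough sketch of what such a target can look like (the recipe body
here is an assumption, not the PR's exact commands):

```make
# Sketch of a cache-clearing target; the exact paths removed are assumptions.
clean-cache:
	@echo "🧹 Clearing cached dependencies and build artifacts..."
	@rm -rf build/ .venv/deps_installed
```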
#### Improved dependency handling:
- Modified `install-py-deps` to create a `.venv/deps_installed` marker,
preventing redundant dependency installations and improving efficiency
(see the sketch below).
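
For illustration, the marker-file pattern generally looks like this
minimal sketch (the prerequisite and installer command are assumed
placeholders, not the PR's exact recipe):

```make
# Minimal sketch of the marker-file pattern. The target and marker names come
# from this PR; the prerequisite and install command are placeholders.
install-py-deps: .venv/deps_installed

.venv/deps_installed: pyproject.toml
	@echo "📦 Installing documentation dependencies..."
	@python3 -m pip install -r requirements.txt  # placeholder install step
	@mkdir -p .venv
	@touch .venv/deps_installed
```

Because the marker is only touched after a successful install, a failed
install re-runs on the next build, while a successful one is skipped
until the prerequisite changes.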
#### Streamlined file generation and infrastructure setup:
- Added caching for the LangServe README download and parallelized
feature table generation (see the sketch below).
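
Caching a download in Make is typically done by making the fetched file
itself the target, so the recipe only runs when the file is missing; a
hedged sketch (the output path is an illustrative assumption):

```make
# Sketch of caching the LangServe README as a file target; the output path is
# an illustrative assumption. Make skips the recipe once the file exists.
build/langserve_readme.md:
	@mkdir -p $(dir $@)
	@curl -fsSL https://raw.githubusercontent.com/langchain-ai/langserve/main/README.md -o $@
```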
- Added user-friendly completion messages for targets like `copy-infra`
and `render`.
#### Documentation server updates:
- Enhanced the `start` target with messages indicating that the server
has started and the URL for viewing the documentation locally
(sketched below).
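
Something along these lines (a sketch; the default port and underlying
command are assumptions — the docs site is Docusaurus-based, so
`yarn start` is a plausible stand-in):

```make
# Sketch of the documentation server target; port default and command are assumptions.
PORT ?= 3000

start:
	@echo "Starting the documentation server..."
	@echo "View the docs at http://localhost:$(PORT)"
	@yarn start --port=$(PORT)  # assumed command for a Docusaurus site
```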
---
### Documentation Improvements:
#### Content clarity and consistency:
- Standardized section titles for consistency across documentation
files.
[[1]](diffhunk://#diff-9b1a85ea8a9dcf79f58246c88692cd7a36316665d7e05a69141cfdc50794c82aL1-R1)
[[2]](diffhunk://#diff-944008ad3a79d8a312183618401fcfa71da0e69c75803eff09b779fc8e03183dL1-R1)
- Refined phrasing and formatting in sections like "Dependency
management" and "Formatting and linting" for better readability.
[[1]](diffhunk://#diff-2069d4f956ab606ae6d51b191439283798adaf3a6648542c409d258131617059L6-R6)
[[2]](diffhunk://#diff-2069d4f956ab606ae6d51b191439283798adaf3a6648542c409d258131617059L84-R82)
#### Enhanced workflows:
- Updated instructions for building and viewing documentation locally,
including tips for specifying server ports and handling API reference
previews.
[[1]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L60-R94)
[[2]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L82-R126)
- Expanded guidance on cleaning documentation artifacts and using
linting tools effectively.
[[1]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L82-R126)
[[2]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L107-R142)
#### API reference documentation:
- Improved instructions for generating and formatting in-code
documentation, highlighting best practices for docstring writing.
[[1]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L107-R142)
[[2]](diffhunk://#diff-048deddcfd44b242e5b23aed9f2e9ec73afc672244ce14df2a0a316d95840c87L144-R186)
---
### Minor Changes:
- Added support for a new package name (`langchain_v1`) in the API
documentation generation script.
- Fixed minor capitalization and formatting issues in documentation
files.
[[1]](diffhunk://#diff-2069d4f956ab606ae6d51b191439283798adaf3a6648542c409d258131617059L40-R40)
[[2]](diffhunk://#diff-2069d4f956ab606ae6d51b191439283798adaf3a6648542c409d258131617059L166-R160)
---------
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
import multiprocessing
import os
import re
import sys
from pathlib import Path
from typing import Iterable, Tuple

import nbformat
from nbconvert.exporters import MarkdownExporter
from nbconvert.preprocessors import Preprocessor

HIDE_IN_NB_MAGIC_OPEN = "<!-- HIDE_IN_NB"
HIDE_IN_NB_MAGIC_CLOSE = "HIDE_IN_NB -->"


class EscapePreprocessor(Preprocessor):
    def preprocess_cell(self, cell, resources, index):
        if cell.cell_type == "markdown":
            # rewrite .ipynb links to .md
            cell.source = re.sub(
                r"\[([^\]]*)\]\((?![^\)]*//)([^)]*)\.ipynb\)",
                r"[\1](\2.md)",
                cell.source,
            )

        elif cell.cell_type == "code":
            # escape ``` in code
            cell.source = cell.source.replace("```", r"\`\`\`")

            # allow overriding title based on comment at beginning of cell
            if cell.source.startswith("# title="):
                lines = cell.source.split("\n")
                title = lines[0].split("# title=")[1]
                if title.startswith('"') and title.endswith('"'):
                    title = title[1:-1]
                cell.metadata["title"] = title
                cell.source = "\n".join(lines[1:])

            # escape ``` in output; drop outputs that are only whitespace
            if "outputs" in cell:
                filter_out = set()
                for i, output in enumerate(cell["outputs"]):
                    if "text" in output:
                        if not output["text"].strip():
                            filter_out.add(i)
                            continue
                        output["text"] = output["text"].replace("```", r"\`\`\`")
                    elif "data" in output:
                        for key, value in output["data"].items():
                            if isinstance(value, str):
                                output["data"][key] = value.replace("```", r"\`\`\`")
                cell["outputs"] = [
                    output
                    for i, output in enumerate(cell["outputs"])
                    if i not in filter_out
                ]

        return cell, resources


class ExtractAttachmentsPreprocessor(Preprocessor):
    """
    Inlines supported image and PDF attachments from notebook cells into
    the cell source as base64 data URIs.
    """

    def preprocess_cell(self, cell, resources, index):
        """
        Apply a transformation on each cell.

        Parameters
        ----------
        cell : NotebookNode cell
            Notebook cell being processed
        resources : dictionary
            Additional resources used in the conversion process. Allows
            preprocessors to pass variables into the Jinja engine.
        index : int
            Index of the cell being processed
        """
        # Make sure outputs key exists
        if not isinstance(resources["outputs"], dict):
            resources["outputs"] = {}

        # Loop through all of the attachments in the cell
        for name, attach in cell.get("attachments", {}).items():
            for mime, data in attach.items():
                if mime not in {
                    "image/png",
                    "image/jpeg",
                    "image/svg+xml",
                    "application/pdf",
                }:
                    continue

                # attachments are pre-rendered. Only replace markdown-formatted
                # images with the following logic
                attach_str = f"({name})"
                if attach_str in cell.source:
                    data = f"(data:{mime};base64,{data})"
                    cell.source = cell.source.replace(attach_str, data)

        return cell, resources


class CustomRegexRemovePreprocessor(Preprocessor):
    def check_conditions(self, cell):
        # Keep a cell unless it is empty/whitespace-only or is marked
        # with a `# | output: false` directive.
        pattern = re.compile(r"(?s)(?:\s*\Z)|(?:.*#\s*\|\s*output:\s*false.*)")
        return not pattern.match(cell.source)

    def preprocess(self, nb, resources):
        nb.cells = [cell for cell in nb.cells if self.check_conditions(cell)]
        return nb, resources


class UnHidePreprocessor(Preprocessor):
    def preprocess_cell(self, cell, resources, index):
        cell.source = cell.source.replace(HIDE_IN_NB_MAGIC_OPEN, "")
        cell.source = cell.source.replace(HIDE_IN_NB_MAGIC_CLOSE, "")
        return cell, resources


exporter = MarkdownExporter(
    preprocessors=[
        EscapePreprocessor,
        ExtractAttachmentsPreprocessor,
        CustomRegexRemovePreprocessor,
        UnHidePreprocessor,
    ],
    template_name="mdoutput",
    extra_template_basedirs=["./scripts/notebook_convert_templates"],
)


def _process_path(tup: Tuple[Path, Path, Path]):
    notebook_path, intermediate_docs_dir, output_docs_dir = tup
    relative = notebook_path.relative_to(intermediate_docs_dir)
    output_path = output_docs_dir / relative.parent / (relative.stem + ".md")
    _convert_notebook(notebook_path, output_path, intermediate_docs_dir)


def _modify_frontmatter(
    body: str, notebook_path: Path, intermediate_docs_dir: Path
) -> str:
    rel_path = notebook_path.relative_to(intermediate_docs_dir).as_posix()
    edit_url = (
        f"https://github.com/langchain-ai/langchain/edit/master/docs/docs/{rel_path}"
    )
    frontmatter = {
        "custom_edit_url": edit_url,
    }
    if re.match(r"^[\s\n]*---\n", body):
        # frontmatter already present
        for k, v in frontmatter.items():
            # if the key already exists anywhere in the body, leave it
            if re.search(rf"^{k}: ", body, re.MULTILINE):
                continue
            body = re.sub(r"^[\s\n]*---\n", f"---\n{k}: {v}\n", body, count=1)
        return body
    else:
        insert = "\n".join([f"{k}: {v}" for k, v in frontmatter.items()])
        return f"---\n{insert}\n---\n{body}"


def _convert_notebook(
    notebook_path: Path, output_path: Path, intermediate_docs_dir: Path
) -> Path:
    import json
    import uuid

    with open(notebook_path, "r", encoding="utf-8") as f:
        nb_json = json.load(f)

    # Fix missing and duplicate cell IDs before nbformat validation
    seen_ids = set()
    for cell in nb_json.get("cells", []):
        if "id" not in cell or not cell.get("id") or cell.get("id") in seen_ids:
            cell["id"] = str(uuid.uuid4())[:8]
        seen_ids.add(cell["id"])

    nb = nbformat.reads(json.dumps(nb_json), as_version=4)

    # Upgrade notebook format
    nb = nbformat.v4.upgrade(nb)

    body, resources = exporter.from_notebook_node(nb)

    body = _modify_frontmatter(body, notebook_path, intermediate_docs_dir)

    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        f.write(body)

    return output_path


if __name__ == "__main__":
    intermediate_docs_dir = Path(sys.argv[1])
    output_docs_dir = Path(sys.argv[2])

    source_paths_arg = os.environ.get("SOURCE_PATHS")
    source_paths: Iterable[Path]
    if source_paths_arg:
        source_path_strs = re.split(r"\s+", source_paths_arg)
        source_paths_stripped = [p.strip() for p in source_path_strs]
        source_paths = [intermediate_docs_dir / p for p in source_paths_stripped if p]
    else:
        original_paths = list(intermediate_docs_dir.glob("**/*.ipynb"))
        # exclude files that exist in the output directory and are newer
        relative_paths = [p.relative_to(intermediate_docs_dir) for p in original_paths]
        out_paths = [
            output_docs_dir / p.parent / (p.stem + ".md") for p in relative_paths
        ]
        source_paths = [
            p
            for p, o in zip(original_paths, out_paths)
            if not o.exists() or o.stat().st_mtime < p.stat().st_mtime
        ]
        print(f"rebuilding {len(source_paths)}/{len(relative_paths)} notebooks")

    with multiprocessing.Pool() as pool:
        pool.map(
            _process_path,
            (
                (notebook_path, intermediate_docs_dir, output_docs_dir)
                for notebook_path in source_paths
            ),
        )
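
For context, the converter script above takes the intermediate docs
directory and the output directory as positional arguments, with an
optional whitespace-separated `SOURCE_PATHS` environment variable to
limit which notebooks are rebuilt; when unset, it falls back to the
mtime comparison in the `__main__` block. A hypothetical Makefile
wiring (directory names and script path are assumptions inferred from
the file's contents):

```make
# Hypothetical wiring of the notebook converter; the directory variables and
# script path are assumptions, not the repo's exact Makefile contents.
INTERMEDIATE_DIR := build/intermediate/docs
OUTPUT_DOCS_DIR := build/output/docs

convert-notebooks:
	@echo "📚 Converting notebooks to Markdown..."
	@SOURCE_PATHS="$(SOURCE_PATHS)" python3 scripts/notebook_convert.py \
		$(INTERMEDIATE_DIR) $(OUTPUT_DOCS_DIR)
```

Invoked as `make convert-notebooks SOURCE_PATHS="how_to/example.ipynb"`,
this would rebuild just the named notebook.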