infra: check templates based on integration (#24857)

Instead of hardcoding a linter for each integration type, iterate through the lines of the template notebook, find the lines that start with `##` (which also matches lower-level headings such as `###`), and enforce that those headings are present in newly contributed docs.
2025-11-02 09:14:45 +00:00 · 2024-07-31 13:19:50 -07:00
parent a7380dd531
commit 17a06cb7a6
1 changed files with 67 additions and 47 deletions
--- a/docs/scripts/check_templates.py
+++ b/docs/scripts/check_templates.py
@@ -1,69 +1,89 @@
 import json
 import re
 import sys
 from functools import cache
 from pathlib import Path
-from typing import Union
+from typing import Dict, Iterable, List, Union
 CURR_DIR = Path(__file__).parent.absolute()
-
+CLI_TEMPLATE_DIR = (
-CHAT_MODEL_HEADERS = (
+    CURR_DIR.parent.parent / "libs/cli/langchain_cli/integration_template/docs"
    "## Overview",
    "### Integration details",
    "### Model features",
    "## Setup",
    "## Instantiation",
    "## Invocation",
    "## Chaining",
    "## API reference",
 )
 CHAT_MODEL_REGEX = r".*".join(CHAT_MODEL_HEADERS)
-DOCUMENT_LOADER_HEADERS = (
+INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {
-    "## Overview",
+    "chat": {
-    "### Integration details",
+        "issue_number": 22296,
-    "### Loader features",
+    },
-    "## Setup",
+    "document_loaders": {
-    "## Instantiation",
+        "issue_number": 22866,
-    "## Load",
+    },
-    "## Lazy Load",
+    "stores": {},
-    "## API reference",
+    "llms": {
-)
+        "issue_number": 24803,
-DOCUMENT_LOADER_REGEX = r".*".join(DOCUMENT_LOADER_HEADERS)
+    },
    "text_embedding": {"issue_number": 14856},
    "toolkits": {"issue_number": "TODO"},
    "tools": {"issue_number": "TODO"},
    "vectorstores": {"issue_number": 24800},
    "retrievers": {"issue_number": "TODO"},
 }
-def check_chat_model(path: Path) -> None:
+@cache
 def _get_headers(doc_dir: str) -> Iterable[str]:
    """Return the section headers required for one integration doc directory.

    Loads ``<doc_dir>.ipynb`` from ``CLI_TEMPLATE_DIR`` and collects, in
    document order, every markdown-cell line that starts with ``##`` (this
    matches ``##`` as well as deeper headings such as ``###``). Lines that
    contain "TODO" are ignored.

    Args:
        doc_dir: Name of the docs directory being checked; it is also the
            stem of the template notebook file.

    Returns:
        The whitespace-stripped header lines in template order.

    Raises:
        FileNotFoundError: If the template notebook does not exist in
            ``CLI_TEMPLATE_DIR``.
    """
    ipynb_name = f"{doc_dir}.ipynb"
    template_path = CLI_TEMPLATE_DIR / ipynb_name
    if not template_path.exists():
        raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")
    # Notebooks are UTF-8 encoded JSON; do not depend on the platform's
    # locale encoding (which is not UTF-8 on every system).
    with open(template_path, "r", encoding="utf-8") as f:
        nb = json.load(f)
    headers: List[str] = []
    for cell in nb["cells"]:
        if cell["cell_type"] != "markdown":
            continue
        source = cell["source"]
        if isinstance(source, str):
            # The notebook format allows "source" to be one multi-line string
            # instead of a list of lines; iterating it directly would walk
            # single characters and silently find no headers.
            source = source.splitlines()
        for line in source:
            if line.startswith("##") and "TODO" not in line:
                headers.append(line.strip())
    return headers
 def check_header_order(path: Path) -> None:
    doc_dir = path.parent.name
    if doc_dir not in INFO_BY_DIR:
        # Skip if not a directory we care about
        return
    headers = _get_headers(doc_dir)
    issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")
    print(f"Checking {doc_dir} page {path}")
    with open(path, "r") as f:
        doc = f.read()
-    if not re.search(CHAT_MODEL_REGEX, doc, re.DOTALL):
+    regex = r".*".join(headers)
-        raise ValueError(
+    if not re.search(regex, doc, re.DOTALL):
-            f"Document {path} does not match the ChatModel Integration page template. "
+        issueline = (
-            f"Please see https://github.com/langchain-ai/langchain/issues/22296 for "
+            (
-            f"instructions on how to correctly format a ChatModel Integration page."
+                " Please see https://github.com/langchain-ai/langchain/issues/"
                f"{issue_number} for instructions on how to correctly format a "
                f"{doc_dir} integration page."
            )
            if isinstance(issue_number, int)
            else ""
        )
 def check_document_loader(path: Path) -> None:
    with open(path, "r") as f:
        doc = f.read()
    if not re.search(DOCUMENT_LOADER_REGEX, doc, re.DOTALL):
        raise ValueError(
-            f"Document {path} does not match the DocumentLoader Integration page template. "
+            f"Document {path} does not match the expected header order.{issueline}"
            f"Please see https://github.com/langchain-ai/langchain/issues/22866 for "
            f"instructions on how to correctly format a DocumentLoader Integration page."
        )
 def main(*new_doc_paths: Union[str, Path]) -> None:
    for path in new_doc_paths:
        path = Path(path).resolve().absolute()
-        if CURR_DIR.parent / "docs" / "integrations" / "chat" in path.parents:
+        if CURR_DIR.parent / "docs" / "integrations" in path.parents:
-            print(f"Checking chat model page {path}")
+            check_header_order(path)
            check_chat_model(path)
        elif (
            CURR_DIR.parent / "docs" / "integrations" / "document_loaders"
            in path.parents
        ):
            print(f"Checking document loader page {path}")
            check_document_loader(path)
        else:
            continue