infra: check templates based on integration (#24857)

Instead of hardcoding a separate linter for each integration type, iterate
through the lines of the template notebook, collect the lines that start
with `##` (this also matches deeper headings such as `###`), and enforce
that those headings appear, in the same order, in newly contributed docs.
This commit is contained in:
Erick Friis 2024-07-31 13:19:50 -07:00 committed by GitHub
parent a7380dd531
commit 17a06cb7a6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,69 +1,89 @@
import json
import re import re
import sys import sys
from functools import cache
from pathlib import Path from pathlib import Path
from typing import Union from typing import Dict, Iterable, List, Union
CURR_DIR = Path(__file__).parent.absolute() CURR_DIR = Path(__file__).parent.absolute()
CLI_TEMPLATE_DIR = (
CHAT_MODEL_HEADERS = ( CURR_DIR.parent.parent / "libs/cli/langchain_cli/integration_template/docs"
"## Overview",
"### Integration details",
"### Model features",
"## Setup",
"## Instantiation",
"## Invocation",
"## Chaining",
"## API reference",
) )
CHAT_MODEL_REGEX = r".*".join(CHAT_MODEL_HEADERS)
DOCUMENT_LOADER_HEADERS = ( INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {
"## Overview", "chat": {
"### Integration details", "issue_number": 22296,
"### Loader features", },
"## Setup", "document_loaders": {
"## Instantiation", "issue_number": 22866,
"## Load", },
"## Lazy Load", "stores": {},
"## API reference", "llms": {
) "issue_number": 24803,
DOCUMENT_LOADER_REGEX = r".*".join(DOCUMENT_LOADER_HEADERS) },
"text_embedding": {"issue_number": 14856},
"toolkits": {"issue_number": "TODO"},
"tools": {"issue_number": "TODO"},
"vectorstores": {"issue_number": 24800},
"retrievers": {"issue_number": "TODO"},
}
def check_chat_model(path: Path) -> None: @cache
def _get_headers(doc_dir: str) -> Iterable[str]:
    """Return the markdown headers (``##`` and deeper) of an integration template.

    The template notebook is ``<doc_dir>.ipynb`` inside ``CLI_TEMPLATE_DIR``.
    Headers are returned in notebook order with surrounding whitespace
    stripped. Headers that contain "TODO" are ignored.

    Args:
        doc_dir: Name of the integration docs directory, e.g. ``"chat"``.

    Returns:
        A tuple of header lines, each starting with ``##``.

    Raises:
        FileNotFoundError: If no template notebook exists for ``doc_dir``.
    """
    ipynb_name = f"{doc_dir}.ipynb"
    template_path = CLI_TEMPLATE_DIR / ipynb_name
    if not template_path.exists():
        raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(template_path, "r", encoding="utf-8") as f:
        nb = json.load(f)

    headers: List[str] = []
    for cell in nb["cells"]:
        if cell["cell_type"] != "markdown":
            continue
        source = cell["source"]
        # A cell's source may be stored either as a list of lines or as one
        # multi-line string; iterating a bare string would yield single
        # characters and silently find no headers at all.
        lines = source.splitlines() if isinstance(source, str) else source
        for line in lines:
            if not line.startswith("##") or "TODO" in line:
                continue
            headers.append(line.strip())
    # Return an immutable sequence: if the result is memoised, a caller
    # mutating it must not be able to corrupt later lookups.
    return tuple(headers)
def check_header_order(path: Path) -> None:
doc_dir = path.parent.name
if doc_dir not in INFO_BY_DIR:
# Skip if not a directory we care about
return
headers = _get_headers(doc_dir)
issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")
print(f"Checking {doc_dir} page {path}")
with open(path, "r") as f: with open(path, "r") as f:
doc = f.read() doc = f.read()
if not re.search(CHAT_MODEL_REGEX, doc, re.DOTALL): regex = r".*".join(headers)
raise ValueError( if not re.search(regex, doc, re.DOTALL):
f"Document {path} does not match the ChatModel Integration page template. " issueline = (
f"Please see https://github.com/langchain-ai/langchain/issues/22296 for " (
f"instructions on how to correctly format a ChatModel Integration page." " Please see https://github.com/langchain-ai/langchain/issues/"
f"{issue_number} for instructions on how to correctly format a "
f"{doc_dir} integration page."
)
if isinstance(issue_number, int)
else ""
) )
def check_document_loader(path: Path) -> None:
with open(path, "r") as f:
doc = f.read()
if not re.search(DOCUMENT_LOADER_REGEX, doc, re.DOTALL):
raise ValueError( raise ValueError(
f"Document {path} does not match the DocumentLoader Integration page template. " f"Document {path} does not match the expected header order.{issueline}"
f"Please see https://github.com/langchain-ai/langchain/issues/22866 for "
f"instructions on how to correctly format a DocumentLoader Integration page."
) )
def main(*new_doc_paths: Union[str, Path]) -> None: def main(*new_doc_paths: Union[str, Path]) -> None:
for path in new_doc_paths: for path in new_doc_paths:
path = Path(path).resolve().absolute() path = Path(path).resolve().absolute()
if CURR_DIR.parent / "docs" / "integrations" / "chat" in path.parents: if CURR_DIR.parent / "docs" / "integrations" in path.parents:
print(f"Checking chat model page {path}") check_header_order(path)
check_chat_model(path)
elif (
CURR_DIR.parent / "docs" / "integrations" / "document_loaders"
in path.parents
):
print(f"Checking document loader page {path}")
check_document_loader(path)
else: else:
continue continue