mirror of
https://github.com/hwchase17/langchain.git
synced 2025-04-28 20:05:58 +00:00
infra: check templates based on integration (#24857)
Instead of hardcoding a linter for each integration type, iterate through the lines of the template notebook, find the lines that start with `##` (this also includes lower-level headings such as `###`), and enforce that those headings are found, in the same order, in newly contributed docs.
This commit is contained in:
parent
a7380dd531
commit
17a06cb7a6
@ -1,69 +1,89 @@
|
|||||||
|
import json
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
from functools import cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Union
|
from typing import Dict, Iterable, List, Union
|
||||||
|
|
||||||
CURR_DIR = Path(__file__).parent.absolute()
|
CURR_DIR = Path(__file__).parent.absolute()
|
||||||
|
CLI_TEMPLATE_DIR = (
|
||||||
CHAT_MODEL_HEADERS = (
|
CURR_DIR.parent.parent / "libs/cli/langchain_cli/integration_template/docs"
|
||||||
"## Overview",
|
|
||||||
"### Integration details",
|
|
||||||
"### Model features",
|
|
||||||
"## Setup",
|
|
||||||
"## Instantiation",
|
|
||||||
"## Invocation",
|
|
||||||
"## Chaining",
|
|
||||||
"## API reference",
|
|
||||||
)
|
)
|
||||||
CHAT_MODEL_REGEX = r".*".join(CHAT_MODEL_HEADERS)
|
|
||||||
|
|
||||||
DOCUMENT_LOADER_HEADERS = (
|
INFO_BY_DIR: Dict[str, Dict[str, Union[int, str]]] = {
|
||||||
"## Overview",
|
"chat": {
|
||||||
"### Integration details",
|
"issue_number": 22296,
|
||||||
"### Loader features",
|
},
|
||||||
"## Setup",
|
"document_loaders": {
|
||||||
"## Instantiation",
|
"issue_number": 22866,
|
||||||
"## Load",
|
},
|
||||||
"## Lazy Load",
|
"stores": {},
|
||||||
"## API reference",
|
"llms": {
|
||||||
)
|
"issue_number": 24803,
|
||||||
DOCUMENT_LOADER_REGEX = r".*".join(DOCUMENT_LOADER_HEADERS)
|
},
|
||||||
|
"text_embedding": {"issue_number": 14856},
|
||||||
|
"toolkits": {"issue_number": "TODO"},
|
||||||
|
"tools": {"issue_number": "TODO"},
|
||||||
|
"vectorstores": {"issue_number": 24800},
|
||||||
|
"retrievers": {"issue_number": "TODO"},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
def check_chat_model(path: Path) -> None:
|
@cache
|
||||||
|
def _get_headers(doc_dir: str) -> Iterable[str]:
|
||||||
|
"""Gets all markdown headers ## and below from the integration template.
|
||||||
|
|
||||||
|
Ignores headers that contain "TODO"."""
|
||||||
|
ipynb_name = f"{doc_dir}.ipynb"
|
||||||
|
if not (CLI_TEMPLATE_DIR / ipynb_name).exists():
|
||||||
|
raise FileNotFoundError(f"Could not find {ipynb_name} in {CLI_TEMPLATE_DIR}")
|
||||||
|
with open(CLI_TEMPLATE_DIR / ipynb_name, "r") as f:
|
||||||
|
nb = json.load(f)
|
||||||
|
|
||||||
|
headers: List[str] = []
|
||||||
|
for cell in nb["cells"]:
|
||||||
|
if cell["cell_type"] == "markdown":
|
||||||
|
for line in cell["source"]:
|
||||||
|
if not line.startswith("##") or "TODO" in line:
|
||||||
|
continue
|
||||||
|
header = line.strip()
|
||||||
|
headers.append(header)
|
||||||
|
return headers
|
||||||
|
|
||||||
|
|
||||||
|
def check_header_order(path: Path) -> None:
|
||||||
|
doc_dir = path.parent.name
|
||||||
|
if doc_dir not in INFO_BY_DIR:
|
||||||
|
# Skip if not a directory we care about
|
||||||
|
return
|
||||||
|
headers = _get_headers(doc_dir)
|
||||||
|
issue_number = INFO_BY_DIR[doc_dir].get("issue_number", "nonexistent")
|
||||||
|
|
||||||
|
print(f"Checking {doc_dir} page {path}")
|
||||||
|
|
||||||
with open(path, "r") as f:
|
with open(path, "r") as f:
|
||||||
doc = f.read()
|
doc = f.read()
|
||||||
if not re.search(CHAT_MODEL_REGEX, doc, re.DOTALL):
|
regex = r".*".join(headers)
|
||||||
raise ValueError(
|
if not re.search(regex, doc, re.DOTALL):
|
||||||
f"Document {path} does not match the ChatModel Integration page template. "
|
issueline = (
|
||||||
f"Please see https://github.com/langchain-ai/langchain/issues/22296 for "
|
(
|
||||||
f"instructions on how to correctly format a ChatModel Integration page."
|
" Please see https://github.com/langchain-ai/langchain/issues/"
|
||||||
|
f"{issue_number} for instructions on how to correctly format a "
|
||||||
|
f"{doc_dir} integration page."
|
||||||
|
)
|
||||||
|
if isinstance(issue_number, int)
|
||||||
|
else ""
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def check_document_loader(path: Path) -> None:
|
|
||||||
with open(path, "r") as f:
|
|
||||||
doc = f.read()
|
|
||||||
if not re.search(DOCUMENT_LOADER_REGEX, doc, re.DOTALL):
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Document {path} does not match the DocumentLoader Integration page template. "
|
f"Document {path} does not match the expected header order.{issueline}"
|
||||||
f"Please see https://github.com/langchain-ai/langchain/issues/22866 for "
|
|
||||||
f"instructions on how to correctly format a DocumentLoader Integration page."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def main(*new_doc_paths: Union[str, Path]) -> None:
|
def main(*new_doc_paths: Union[str, Path]) -> None:
|
||||||
for path in new_doc_paths:
|
for path in new_doc_paths:
|
||||||
path = Path(path).resolve().absolute()
|
path = Path(path).resolve().absolute()
|
||||||
if CURR_DIR.parent / "docs" / "integrations" / "chat" in path.parents:
|
if CURR_DIR.parent / "docs" / "integrations" in path.parents:
|
||||||
print(f"Checking chat model page {path}")
|
check_header_order(path)
|
||||||
check_chat_model(path)
|
|
||||||
elif (
|
|
||||||
CURR_DIR.parent / "docs" / "integrations" / "document_loaders"
|
|
||||||
in path.parents
|
|
||||||
):
|
|
||||||
print(f"Checking document loader page {path}")
|
|
||||||
check_document_loader(path)
|
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user