Merge branch 'master' into erick/docs-rewrite-contributor-docs

2026-04-08 05:23:10 +00:00 · 2024-05-28 16:18:13 -07:00
parent 5491993f8a d61bdeba25
commit cf31a0a3f0
510 changed files with 19885 additions and 1927 deletions
--- a/docs/scripts/arxiv_references.py
+++ b/docs/scripts/arxiv_references.py
@@ -7,7 +7,7 @@ import os
 import re
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Dict, List, Set
+from typing import Any, Dict

 from pydantic.v1 import BaseModel, root_validator

@@ -17,6 +17,7 @@ _ROOT_DIR = Path(os.path.abspath(__file__)).parents[2]
 DOCS_DIR = _ROOT_DIR / "docs" / "docs"
 CODE_DIR = _ROOT_DIR / "libs"
 TEMPLATES_DIR = _ROOT_DIR / "templates"
+COOKBOOKS_DIR = _ROOT_DIR / "cookbook"
 ARXIV_ID_PATTERN = r"https://arxiv\.org/(abs|pdf)/(\d+\.\d+)"
 LANGCHAIN_PYTHON_URL = "python.langchain.com"

@@ -29,6 +30,7 @@ class ArxivPaper:
    referencing_doc2url: dict[str, str]
    referencing_api_ref2url: dict[str, str]
    referencing_template2url: dict[str, str]
+    referencing_cookbook2url: dict[str, str]
    title: str
    authors: list[str]
    abstract: str
@@ -50,7 +52,6 @@ def search_documentation_for_arxiv_references(docs_dir: Path) -> dict[str, set[s
    arxiv_url_pattern = re.compile(ARXIV_ID_PATTERN)
    exclude_strings = {"file_path", "metadata", "link", "loader", "PyPDFLoader"}

-    # loop all the files (ipynb, mdx, md) in the docs folder
    files = (
        p.resolve()
        for p in Path(docs_dir).glob("**/*")
@@ -76,39 +77,6 @@ def search_documentation_for_arxiv_references(docs_dir: Path) -> dict[str, set[s
    return arxiv_id2file_names


-def convert_module_name_and_members_to_urls(
-    arxiv_id2module_name_and_members: dict[str, set[str]],
-) -> dict[str, set[str]]:
-    arxiv_id2urls = {}
-    for arxiv_id, module_name_and_members in arxiv_id2module_name_and_members.items():
-        urls = set()
-        for module_name_and_member in module_name_and_members:
-            module_name, type_and_member = module_name_and_member.split(":")
-            if "$" in type_and_member:
-                type, member = type_and_member.split("$")
-            else:
-                type = type_and_member
-                member = ""
-            _namespace_parts = module_name.split(".")
-            if type == "module":
-                first_namespace_part = _namespace_parts[0]
-                if first_namespace_part.startswith("langchain_"):
-                    first_namespace_part = first_namespace_part.replace(
-                        "langchain_", ""
-                    )
-                url = f"{first_namespace_part}_api_reference.html#module-{module_name}"
-            elif type in ["class", "function"]:
-                second_namespace_part = _namespace_parts[1]
-                url = f"{second_namespace_part}/{module_name}.{member}.html#{module_name}.{member}"
-            else:
-                raise ValueError(
-                    f"Unknown type: {type} in the {module_name_and_member}."
-                )
-            urls.add(url)
-        arxiv_id2urls[arxiv_id] = urls
-    return arxiv_id2urls
-
-
 def search_code_for_arxiv_references(code_dir: Path) -> dict[str, set[str]]:
    """Search the code for arXiv references.

@@ -220,7 +188,6 @@ def search_code_for_arxiv_references(code_dir: Path) -> dict[str, set[str]]:

 def search_templates_for_arxiv_references(templates_dir: Path) -> dict[str, set[str]]:
    arxiv_url_pattern = re.compile(ARXIV_ID_PATTERN)
-    # exclude_strings = {"file_path", "metadata", "link", "loader", "PyPDFLoader"}

    # loop all the Readme.md files since they are parsed into LangChain documentation
    # exclude the Readme.md in the root folder
@@ -234,8 +201,6 @@ def search_templates_for_arxiv_references(templates_dir: Path) -> dict[str, set[
        with open(file, "r", encoding="utf-8") as f:
            lines = f.readlines()
            for line in lines:
-                # if any(exclude_string in line for exclude_string in exclude_strings):
-                #     continue
                matches = arxiv_url_pattern.search(line)
                if matches:
                    arxiv_id = matches.group(2)
@@ -247,6 +212,58 @@ def search_templates_for_arxiv_references(templates_dir: Path) -> dict[str, set[
    return arxiv_id2template_names


+def search_cookbooks_for_arxiv_references(cookbooks_dir: Path) -> dict[str, set[str]]:
+    """Search the cookbook notebooks for arXiv references.
+
+    Every ``*.ipynb`` file found recursively under ``cookbooks_dir`` is read
+    line by line and scanned with ``ARXIV_ID_PATTERN``.
+
+    Args:
+        cookbooks_dir: Directory that contains the cookbook notebooks.
+
+    Returns:
+        Mapping of arXiv id to the set of notebook names (file stems) that
+        reference it.
+    """
+    arxiv_url_pattern = re.compile(ARXIV_ID_PATTERN)
+    files = (p.resolve() for p in Path(cookbooks_dir).glob("**/*.ipynb"))
+    arxiv_id2cookbook_names: dict[str, set[str]] = {}
+    for file in files:
+        cookbook_name = file.stem
+        with open(file, "r", encoding="utf-8") as f:
+            for line in f:
+                # finditer (not search): a single line may hold several arXiv
+                # links and each of them has to be recorded, not only the first.
+                for match in arxiv_url_pattern.finditer(line):
+                    arxiv_id2cookbook_names.setdefault(match.group(2), set()).add(
+                        cookbook_name
+                    )
+    return arxiv_id2cookbook_names
+
+
+def convert_module_name_and_members_to_urls(
+    arxiv_id2module_name_and_members: dict[str, set[str]],
+) -> dict[str, set[str]]:
+    """Convert module/member descriptors into relative API-reference URLs.
+
+    Each descriptor is a string of the form ``<module_name>:<type>`` or
+    ``<module_name>:<type>$<member>``, where ``<type>`` is ``module``,
+    ``class`` or ``function``.
+
+    Args:
+        arxiv_id2module_name_and_members: Mapping of arXiv id to the set of
+            descriptors that reference it.
+
+    Returns:
+        Mapping of arXiv id to the set of URL paths built from its descriptors.
+
+    Raises:
+        ValueError: If a descriptor carries a type other than ``module``,
+            ``class`` or ``function``.
+    """
+    arxiv_id2urls = {}
+    for arxiv_id, module_name_and_members in arxiv_id2module_name_and_members.items():
+        urls = set()
+        for module_name_and_member in module_name_and_members:
+            # "<module_name>:<type>[$<member>]"
+            module_name, type_and_member = module_name_and_member.split(":")
+            if "$" in type_and_member:
+                type_, member = type_and_member.split("$")
+            else:
+                # a bare type (no "$") carries no member name
+                type_ = type_and_member
+                member = ""
+            _namespace_parts = module_name.split(".")
+            if type_ == "module":
+                # module pages are keyed by the top-level package name with the
+                # "langchain_" text removed, e.g. "langchain_core" -> "core"
+                first_namespace_part = _namespace_parts[0]
+                if first_namespace_part.startswith("langchain_"):
+                    first_namespace_part = first_namespace_part.replace(
+                        "langchain_", ""
+                    )
+                url = f"{first_namespace_part}_api_reference.html#module-{module_name}"
+            elif type_ in ["class", "function"]:
+                # class/function pages are placed under the second namespace part
+                second_namespace_part = _namespace_parts[1]
+                url = f"{second_namespace_part}/{module_name}.{member}.html#{module_name}.{member}"
+            else:
+                raise ValueError(
+                    f"Unknown type: {type_} in the {module_name_and_member}."
+                )
+            urls.add(url)
+        arxiv_id2urls[arxiv_id] = urls
+    return arxiv_id2urls
+
+
 def _get_doc_path(file_parts: tuple[str, ...], file_extension) -> str:
    """Get the relative path to the documentation page
    from the absolute path of the file.
@@ -285,60 +302,6 @@ def _get_module_name(file_parts: tuple[str, ...]) -> str:
    return ".".join(ns_parts)


-def compound_urls(
-    arxiv_id2file_names: dict[str, set[str]],
-    arxiv_id2code_urls: dict[str, set[str]],
-    arxiv_id2templates: dict[str, set[str]],
-) -> dict[str, dict[str, set[str]]]:
-    # format urls and verify that the urls are correct
-    arxiv_id2file_names_new = {}
-    for arxiv_id, file_names in arxiv_id2file_names.items():
-        key2urls = {
-            key: _format_doc_url(key)
-            for key in file_names
-            if _is_url_ok(_format_doc_url(key))
-        }
-        if key2urls:
-            arxiv_id2file_names_new[arxiv_id] = key2urls
-
-    arxiv_id2code_urls_new = {}
-    for arxiv_id, code_urls in arxiv_id2code_urls.items():
-        key2urls = {
-            key: _format_api_ref_url(key)
-            for key in code_urls
-            if _is_url_ok(_format_api_ref_url(key))
-        }
-        if key2urls:
-            arxiv_id2code_urls_new[arxiv_id] = key2urls
-
-    arxiv_id2templates_new = {}
-    for arxiv_id, templates in arxiv_id2templates.items():
-        key2urls = {
-            key: _format_template_url(key)
-            for key in templates
-            if _is_url_ok(_format_template_url(key))
-        }
-        if key2urls:
-            arxiv_id2templates_new[arxiv_id] = key2urls
-
-    arxiv_id2type2key2urls = dict.fromkeys(
-        arxiv_id2file_names_new | arxiv_id2code_urls_new | arxiv_id2templates_new
-    )
-    arxiv_id2type2key2urls = {k: {} for k in arxiv_id2type2key2urls}
-    for arxiv_id, key2urls in arxiv_id2file_names_new.items():
-        arxiv_id2type2key2urls[arxiv_id]["docs"] = key2urls
-    for arxiv_id, key2urls in arxiv_id2code_urls_new.items():
-        arxiv_id2type2key2urls[arxiv_id]["apis"] = key2urls
-    for arxiv_id, key2urls in arxiv_id2templates_new.items():
-        arxiv_id2type2key2urls[arxiv_id]["templates"] = key2urls
-
-    # reverse sort by the arxiv_id (the newest papers first)
-    ret = dict(
-        sorted(arxiv_id2type2key2urls.items(), key=lambda item: item[0], reverse=True)
-    )
-    return ret
-
-
 def _is_url_ok(url: str) -> bool:
    """Check if the url page is open without error."""
    import requests
@@ -389,7 +352,7 @@ class ArxivAPIWrapper(BaseModel):

        Returns:
            List of ArxivPaper objects.
-        """  # noqa: E501
+        """

        def cut_authors(authors: list) -> list[str]:
            if len(authors) > 3:
@@ -424,6 +387,9 @@ class ArxivAPIWrapper(BaseModel):
                referencing_template2url=type2key2urls["templates"]
                if "templates" in type2key2urls
                else {},
+                referencing_cookbook2url=type2key2urls["cookbooks"]
+                if "cookbooks" in type2key2urls
+                else {},
            )
            for result, type2key2urls in zip(results, arxiv_id2type2key2urls.values())
        ]
@@ -443,6 +409,10 @@ def _format_template_url(template_name: str) -> str:
    return f"https://{LANGCHAIN_PYTHON_URL}/docs/templates/{template_name}"


+def _format_cookbook_url(cookbook_name: str) -> str:
+    """Return the GitHub URL of the cookbook notebook named ``cookbook_name``.
+
+    ``cookbook_name`` is the notebook file name without the ``.ipynb`` suffix;
+    the suffix is appended here.
+    """
+    return f"https://github.com/langchain-ai/langchain/blob/master/cookbook/{cookbook_name}.ipynb"
+
+
 def _compact_module_full_name(doc_path: str) -> str:
    # agents/langchain_core.agents.AgentAction.html#langchain_core.agents.AgentAction
    module = doc_path.split("#")[1].replace("module-", "")
@@ -454,9 +424,79 @@ def _compact_module_full_name(doc_path: str) -> str:
    return module


+def compound_urls(
+    arxiv_id2file_names: dict[str, set[str]],
+    arxiv_id2code_urls: dict[str, set[str]],
+    arxiv_id2templates: dict[str, set[str]],
+    arxiv_id2cookbooks: dict[str, set[str]],
+) -> dict[str, dict[str, set[str]]]:
+    """Combine the references from all four sources into one mapping.
+
+    For every source the keys are formatted into URLs, each URL is checked
+    with ``_is_url_ok`` and keys whose URL fails the check are dropped.
+    arXiv ids left without any valid URL in a source are omitted for it.
+
+    Args:
+        arxiv_id2file_names: arXiv id -> documentation file names.
+        arxiv_id2code_urls: arXiv id -> API reference url paths.
+        arxiv_id2templates: arXiv id -> template names.
+        arxiv_id2cookbooks: arXiv id -> cookbook names.
+
+    Returns:
+        arXiv id -> source type (``"docs"``, ``"apis"``, ``"templates"``,
+        ``"cookbooks"``) -> ``{key: url}``, ordered by arXiv id descending.
+        NOTE(review): the declared return type ends in ``set[str]`` while the
+        innermost values built below are key -> url dicts.
+    """
+    # format urls and verify that the urls are correct
+    arxiv_id2file_names_new = {}
+    for arxiv_id, file_names in arxiv_id2file_names.items():
+        key2urls = {
+            key: _format_doc_url(key)
+            for key in file_names
+            if _is_url_ok(_format_doc_url(key))
+        }
+        if key2urls:
+            arxiv_id2file_names_new[arxiv_id] = key2urls
+
+    arxiv_id2code_urls_new = {}
+    for arxiv_id, code_urls in arxiv_id2code_urls.items():
+        key2urls = {
+            key: _format_api_ref_url(key)
+            for key in code_urls
+            if _is_url_ok(_format_api_ref_url(key))
+        }
+        if key2urls:
+            arxiv_id2code_urls_new[arxiv_id] = key2urls
+
+    arxiv_id2templates_new = {}
+    for arxiv_id, templates in arxiv_id2templates.items():
+        key2urls = {
+            key: _format_template_url(key)
+            for key in templates
+            if _is_url_ok(_format_template_url(key))
+        }
+        if key2urls:
+            arxiv_id2templates_new[arxiv_id] = key2urls
+
+    arxiv_id2cookbooks_new = {}
+    for arxiv_id, cookbooks in arxiv_id2cookbooks.items():
+        key2urls = {
+            key: _format_cookbook_url(key)
+            for key in cookbooks
+            if _is_url_ok(_format_cookbook_url(key))
+        }
+        if key2urls:
+            arxiv_id2cookbooks_new[arxiv_id] = key2urls
+
+    # the dict union is used only to collect every arXiv id that survived in
+    # at least one source; the merged values are discarded by fromkeys
+    arxiv_id2type2key2urls = dict.fromkeys(
+        arxiv_id2file_names_new
+        | arxiv_id2code_urls_new
+        | arxiv_id2templates_new
+        | arxiv_id2cookbooks_new
+    )
+    # give every arXiv id its own (initially empty) type -> key2urls dict
+    arxiv_id2type2key2urls = {k: {} for k in arxiv_id2type2key2urls}
+    for arxiv_id, key2urls in arxiv_id2file_names_new.items():
+        arxiv_id2type2key2urls[arxiv_id]["docs"] = key2urls
+    for arxiv_id, key2urls in arxiv_id2code_urls_new.items():
+        arxiv_id2type2key2urls[arxiv_id]["apis"] = key2urls
+    for arxiv_id, key2urls in arxiv_id2templates_new.items():
+        arxiv_id2type2key2urls[arxiv_id]["templates"] = key2urls
+    for arxiv_id, key2urls in arxiv_id2cookbooks_new.items():
+        arxiv_id2type2key2urls[arxiv_id]["cookbooks"] = key2urls
+
+    # reverse sort by the arxiv_id (the newest papers first)
+    ret = dict(
+        sorted(arxiv_id2type2key2urls.items(), key=lambda item: item[0], reverse=True)
+    )
+    return ret
+
+
 def log_results(arxiv_id2type2key2urls):
    arxiv_ids = arxiv_id2type2key2urls.keys()
-    doc_number, api_number, templates_number = 0, 0, 0
+    doc_number, api_number, templates_number, cookbooks_number = 0, 0, 0, 0
    for type2key2url in arxiv_id2type2key2urls.values():
        if "docs" in type2key2url:
            doc_number += len(type2key2url["docs"])
@@ -464,9 +504,11 @@ def log_results(arxiv_id2type2key2urls):
            api_number += len(type2key2url["apis"])
        if "templates" in type2key2url:
            templates_number += len(type2key2url["templates"])
+        if "cookbooks" in type2key2url:
+            cookbooks_number += len(type2key2url["cookbooks"])
    logger.warning(
        f"Found {len(arxiv_ids)} arXiv references in the {doc_number} docs, {api_number} API Refs,"
-        f" and {templates_number} Templates."
+        f" {templates_number} Templates, and {cookbooks_number} Cookbooks."
    )


@@ -477,7 +519,7 @@ def generate_arxiv_references_page(file_name: Path, papers: list[ArxivPaper]) ->
            
 LangChain implements the latest research in the field of Natural Language Processing.
 This page contains `arXiv` papers referenced in the LangChain Documentation, API Reference,
-and Templates.
+ Templates, and Cookbooks.

 ## Summary

@@ -510,6 +552,14 @@ and Templates.
                        for key, url in paper.referencing_template2url.items()
                    )
                ]
+            if paper.referencing_cookbook2url:
+                refs += [
+                    "`Cookbook:` "
+                    + ", ".join(
+                        f"[{key}]({url})"
+                        for key, url in paper.referencing_cookbook2url.items()
+                    )
+                ]
            refs_str = ", ".join(refs)

            title_link = f"[{paper.title}]({paper.url})"
@@ -533,8 +583,17 @@ and Templates.
                if paper.referencing_template2url
                else ""
            )
+            cookbook_refs = (
+                f"   - **Cookbook:** {', '.join(f'[{key}]({url})' for key, url in paper.referencing_cookbook2url.items())}"
+                if paper.referencing_cookbook2url
+                else ""
+            )
            refs = "\n".join(
-                [el for el in [docs_refs, api_ref_refs, template_refs] if el]
+                [
+                    el
+                    for el in [docs_refs, api_ref_refs, template_refs, cookbook_refs]
+                    if el
+                ]
            )
            f.write(f"""
 ## {paper.title}
@@ -562,8 +621,9 @@ def main():
    )
    arxiv_id2file_names = search_documentation_for_arxiv_references(DOCS_DIR)
    arxiv_id2templates = search_templates_for_arxiv_references(TEMPLATES_DIR)
+    arxiv_id2cookbooks = search_cookbooks_for_arxiv_references(COOKBOOKS_DIR)
    arxiv_id2type2key2urls = compound_urls(
-        arxiv_id2file_names, arxiv_id2code_urls, arxiv_id2templates
+        arxiv_id2file_names, arxiv_id2code_urls, arxiv_id2templates, arxiv_id2cookbooks
    )
    log_results(arxiv_id2type2key2urls)

--- a/docs/scripts/copy_templates.py
+++ b/docs/scripts/copy_templates.py
@@ -27,6 +27,7 @@ if __name__ == "__main__":

    sidebar_hidden = """---
 sidebar_class_name: hidden
+custom_edit_url:
 ---

 """
--- a/docs/scripts/generate_api_reference_links.py
+++ b/docs/scripts/generate_api_reference_links.py
@@ -186,7 +186,7 @@ def replace_imports(file):
    data = code_block_re.sub(replacer, data)

    # if all_imports:
-    #     print(f"Adding {len(all_imports)} links for imports in {file}")  # noqa: T201
+    #     print(f"Adding {len(all_imports)} links for imports in {file}")
    with open(file, "w") as f:
        f.write(data)
    return all_imports
--- a/docs/scripts/model_feat_table.py
+++ b/docs/scripts/model_feat_table.py
@@ -24,6 +24,7 @@ CHAT_MODEL_FEAT_TABLE = {
    "ChatMistralAI": {
        "tool_calling": True,
        "structured_output": True,
+        "json_model": True,
        "package": "langchain-mistralai",
        "link": "/docs/integrations/chat/mistralai/",
    },
@@ -80,6 +81,7 @@ CHAT_MODEL_FEAT_TABLE = {
        "link": "/docs/integrations/chat/bedrock/",
    },
    "ChatHuggingFace": {
+        "tool_calling": True,
        "local": True,
        "package": "langchain-huggingface",
        "link": "/docs/integrations/chat/huggingface/",
@@ -102,6 +104,7 @@ LLM_TEMPLATE = """\
 sidebar_position: 1
 sidebar_class_name: hidden
 keywords: [compatibility]
+custom_edit_url:
 ---

 # LLMs
@@ -116,13 +119,14 @@ Each LLM integration can optionally provide native implementations for async, st

 {table}

-"""  # noqa: E501
+"""

 CHAT_MODEL_TEMPLATE = """\
 ---
 sidebar_position: 0
 sidebar_class_name: hidden
 keywords: [compatibility, bind_tools, tool calling, function calling, structured output, with_structured_output, json mode, local model]
+custom_edit_url:
 ---

 # Chat models
@@ -133,7 +137,7 @@ The following table shows all the chat models that support one or more advanced

 {table}

-"""  # noqa: E501
+"""


 def get_llm_table():
--- a/docs/scripts/notebook_convert.py
+++ b/docs/scripts/notebook_convert.py
@@ -112,15 +112,39 @@ def _process_path(tup: Tuple[Path, Path, Path]):
    notebook_path, intermediate_docs_dir, output_docs_dir = tup
    relative = notebook_path.relative_to(intermediate_docs_dir)
    output_path = output_docs_dir / relative.parent / (relative.stem + ".md")
-    _convert_notebook(notebook_path, output_path)
+    _convert_notebook(notebook_path, output_path, intermediate_docs_dir)


-def _convert_notebook(notebook_path: Path, output_path: Path):
+def _modify_frontmatter(
+    body: str, notebook_path: Path, intermediate_docs_dir: Path
+) -> str:
+    """Make sure the converted notebook carries a ``custom_edit_url`` entry.
+
+    Args:
+        body: Markdown text produced from the notebook.
+        notebook_path: Path of the source notebook.
+        intermediate_docs_dir: Directory ``notebook_path`` is made relative to
+            when building the edit link.
+
+    Returns:
+        ``body`` unchanged when it already declares ``custom_edit_url``;
+        otherwise ``body`` with the key inserted at the top of its existing
+        frontmatter, or with a new frontmatter block prepended when it has
+        none.
+    """
+    rel_path = notebook_path.relative_to(intermediate_docs_dir).as_posix()
+    edit_url = (
+        f"https://github.com/langchain-ai/langchain/edit/master/docs/docs/{rel_path}"
+    )
+    frontmatter_start = re.match(r"^[\s\n]*---\n", body)
+    if frontmatter_start is None:
+        # no frontmatter yet: create one holding only the edit url
+        return f"---\ncustom_edit_url: {edit_url}\n---\n{body}"
+    # if custom_edit_url already exists, leave it.
+    # re.search + MULTILINE is required here: re.match only tests position 0,
+    # which holds the opening `---`, so it could never see the key. The
+    # pattern has no trailing space so an empty `custom_edit_url:` also counts.
+    if re.search(r"^custom_edit_url:", body, flags=re.MULTILINE):
+        return body
+    # swap the opening delimiter for "delimiter + key", keeping the rest as is
+    return f"---\ncustom_edit_url: {edit_url}\n" + body[frontmatter_start.end() :]
+
+
+def _convert_notebook(
+    notebook_path: Path, output_path: Path, intermediate_docs_dir: Path
+) -> Path:
    with open(notebook_path) as f:
        nb = nbformat.read(f, as_version=4)

    body, resources = exporter.from_notebook_node(nb)

+    body = _modify_frontmatter(body, notebook_path, intermediate_docs_dir)
+
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, "w") as f:
--- a/docs/scripts/resolve_local_links.py
+++ b/docs/scripts/resolve_local_links.py
@@ -13,8 +13,13 @@ def update_links(doc_path, docs_link):
    # replace relative links
    content = re.sub(r"\]\(\.\/", f"]({docs_link}", content)

+    frontmatter = """---
+custom_edit_url:
+---
+"""
+
    with open(DOCS_DIR / doc_path, "w") as f:
-        f.write(content)
+        f.write(frontmatter + content)


 if __name__ == "__main__":