Add api cross ref linking (#8275)

Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52)
2025-09-07 05:52:15 +00:00 · 2023-07-26 12:38:58 -07:00
parent a612800ef0
commit 01a9b06400
16 changed files with 263 additions and 28 deletions
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -20,3 +20,5 @@ jobs:
        uses: actions/checkout@v3
      - name: Codespell
        uses: codespell-project/actions-codespell@v2
        with:
          skip: guide_imports.json
--- a/docs/api_reference/conf.py
+++ b/docs/api_reference/conf.py
@@ -7,20 +7,66 @@
 # -- Path setup --------------------------------------------------------------
 import json
 import os
 import sys
 from pathlib import Path
 import toml
 from docutils import nodes
 from sphinx.util.docutils import SphinxDirective
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
 #
 import os
 import sys
 import toml
 _DIR = Path(__file__).parent.absolute()
 sys.path.insert(0, os.path.abspath("."))
 sys.path.insert(0, os.path.abspath("../../libs/langchain"))
-with open("../../libs/langchain/pyproject.toml") as f:
+with (_DIR.parents[1] / "libs" / "langchain" / "pyproject.toml").open("r") as f:
    data = toml.load(f)
 with (_DIR / "guide_imports.json").open("r") as f:
    imported_classes = json.load(f)
 class ExampleLinksDirective(SphinxDirective):
    """Sphinx directive that renders back-links from an API page to examples.
    The mapping of class/function name -> {example title: example URL} is read
    from the module-level ``imported_classes`` dict (loaded from
    ``guide_imports.json``). The directive takes exactly one argument, the name
    to look up, and has no body content."""
    has_content = False
    required_arguments = 1
    def run(self):
        """Build the docutils nodes for the directive.
        Returns:
            ``[title, bullet_list]`` when at least one example is known for the
            requested name, otherwise a list holding only the empty bullet list.
        """
        target_name = self.arguments[0]
        examples = imported_classes.get(target_name, {})
        bullets = nodes.bullet_list()
        for example_title, example_url in examples.items():
            # reference -> paragraph -> list_item, assembled innermost first.
            anchor = nodes.reference()
            anchor["refuri"] = example_url
            anchor.append(nodes.Text(example_title))
            paragraph = nodes.paragraph()
            paragraph.append(anchor)
            entry = nodes.list_item()
            entry.append(paragraph)
            bullets.append(entry)
        if not bullets.children:
            # Nothing links here: emit the bare (empty) list without a heading.
            return [bullets]
        heading = nodes.title()
        heading.append(nodes.Text(f"Examples using {target_name}"))
        return [heading, bullets]
 def setup(app):
    """Register the ``example_links`` directive on the given application.
    Args:
        app: Object exposing ``add_directive`` (presumably the Sphinx
            application handed to ``conf.py`` -- confirm against Sphinx docs).
    """
    directive_name = "example_links"
    app.add_directive(directive_name, ExampleLinksDirective)
 # -- Project information -----------------------------------------------------
--- a/docs/api_reference/create_api_rst.py
+++ b/docs/api_reference/create_api_rst.py
@@ -78,6 +78,7 @@ Functions
 .. autosummary::
    :toctree: {module}
    :template: function.rst
    {fstring}
--- a/docs/api_reference/guide_imports.json
+++ b/docs/api_reference/guide_imports.json
--- a/docs/api_reference/modules/evaluation.rst
+++ b/docs/api_reference/modules/evaluation.rst
@@ -1,9 +0,0 @@
 Evaluation
 =======================
 LangChain has a number of convenient evaluation chains you can use off the shelf to grade your models' oupputs.
 .. automodule:: langchain.evaluation
   :members:
   :undoc-members:
   :inherited-members:
--- a/docs/api_reference/templates/class.rst
+++ b/docs/api_reference/templates/class.rst
@@ -26,3 +26,5 @@
   {%- endfor %}
   {% endif %}
   {% endblock %}
 .. example_links:: {{ objname }}
--- a/docs/api_reference/templates/function.rst
+++ b/docs/api_reference/templates/function.rst
@@ -0,0 +1,8 @@
 :mod:`{{module}}`.{{objname}}
 {{ underline }}==============
 .. currentmodule:: {{ module }}
 .. autofunction:: {{ objname }}
 .. example_links:: {{ objname }}
--- a/docs/docs_skeleton/generate_api_reference_links.py
+++ b/docs/docs_skeleton/generate_api_reference_links.py
@@ -0,0 +1,150 @@
 import importlib
 import inspect
 import json
 import logging
 import os
 import re
 from pathlib import Path
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Base URL for all class documentation
 _BASE_URL = "https://api.python.langchain.com/en/latest/"
 # Regular expression to match Python code blocks
 code_block_re = re.compile(r"^(```python\n)(.*?)(```\n)", re.DOTALL | re.MULTILINE)
 # Regular expression to match langchain import lines
 _IMPORT_RE = re.compile(r"(from\s+(langchain\.\w+(\.\w+)*?)\s+import\s+)(\w+)")
 _CURRENT_PATH = Path(__file__).parent.absolute()
 # Directory where generated markdown files are stored
 _DOCS_DIR = _CURRENT_PATH / "docs"
 _JSON_PATH = _CURRENT_PATH.parent / "api_reference" / "guide_imports.json"
 def find_files(path):
    """Yield the path of every ``.mdx`` or ``.md`` file found beneath ``path``.
    The tree is walked with ``os.walk``; each yielded value is the directory
    joined with the file name."""
    wanted_suffixes = (".mdx", ".md")
    for dirpath, _, filenames in os.walk(path):
        yield from (
            os.path.join(dirpath, name)
            for name in filenames
            if name.endswith(wanted_suffixes)
        )
 def get_full_module_name(module_path, class_name):
    """Return the dotted name of the module that defines ``class_name``.
    ``module_path`` is imported, ``class_name`` is looked up on it, and
    ``inspect.getmodule`` resolves where that object was actually defined
    (which can differ from ``module_path`` when the name is re-exported).
    Raises:
        ImportError: if ``module_path`` cannot be imported.
        AttributeError: if the module has no attribute ``class_name``, or if
            ``inspect.getmodule`` cannot determine a module (returns ``None``).
    """
    imported_from = importlib.import_module(module_path)
    target = getattr(imported_from, class_name)
    defining_module = inspect.getmodule(target)
    return defining_module.__name__
 def main():
    """Annotate every doc file and write the example back-link index.
    Each file yielded by ``find_files(_DOCS_DIR)`` is passed to
    ``replace_imports``. The returned import records are folded into a mapping
    ``{imported name: {doc title: doc URL}}``, which is dumped as JSON to
    ``_JSON_PATH``."""
    global_imports = {}
    for file in find_files(_DOCS_DIR):
        print(f"Adding links for imports in {file}")
        # replace_imports rewrites the file in place and returns its import records.
        file_imports = replace_imports(file)
        if not file_imports:
            continue
        relative_path = os.path.relpath(file, _DOCS_DIR)
        # Drop only the trailing extension (str.replace would also eat ".md"
        # appearing elsewhere in the path) and use "/" so the result is a valid
        # URL path regardless of the platform's separator.
        page_path = os.path.splitext(relative_path)[0].replace(os.sep, "/")
        doc_url = f"https://python.langchain.com/docs/{page_path}"
        for import_info in file_imports:
            doc_title = import_info["title"]
            class_name = import_info["imported"]
            global_imports.setdefault(class_name, {})[doc_title] = doc_url
    # Write the global imports information to a JSON file
    with _JSON_PATH.open("w") as f:
        json.dump(global_imports, f)
 def _get_doc_title(data: str, file_name: str) -> str:
    try:
        return re.findall(r"^#\s+(.*)", data, re.MULTILINE)[0]
    except IndexError:
        pass
    # Parse the rst-style titles
    try:
        return re.findall(r"^(.*)\n=+\n", data, re.MULTILINE)[0]
    except IndexError:
        return file_name
 def replace_imports(file):
    """Annotate the Python code blocks of ``file`` with API reference links.
    For every fenced Python block matched by ``code_block_re``, each import
    matched by ``_IMPORT_RE`` is resolved to the module that defines it and an
    ``<!--IMPORTS:[...]-->`` comment carrying that information is injected at
    the top of the block (replacing any comment left by a previous run). The
    file is rewritten in place.
    Args:
        file: Path of the Markdown/MDX file to process.
    Returns:
        A list with one dict per resolved import, with the keys ``imported``,
        ``source``, ``docs`` and ``title``.
    """
    all_imports = []
    # Pin the encoding so the result does not depend on the build machine's locale.
    with open(file, "r", encoding="utf-8") as f:
        data = f.read()
    file_name = os.path.basename(file)
    doc_title = _get_doc_title(data, file_name)
    # Matches a comment injected by an earlier run, so reruns do not stack them.
    # TODO: Use our own custom <code> component rather than this
    # injection method
    existing_comment_re = re.compile(r"^<!--IMPORTS:.*?-->\n", re.MULTILINE)
    # _BASE_URL may already end in "/"; normalise so URLs never contain "//".
    base_url = _BASE_URL.rstrip("/")
    def replacer(match):
        """Return the code block in ``match`` with a fresh IMPORTS comment."""
        # Extract the code block content, minus any previously injected comment
        code = existing_comment_re.sub("", match.group(2))
        # Process imports in the code block
        imports = []
        for import_match in _IMPORT_RE.finditer(code):
            source_module = import_match.group(2)
            class_name = import_match.group(4)
            try:
                module_path = get_full_module_name(source_module, class_name)
            except AttributeError as e:
                logger.warning(f"Could not find module for {class_name}, {e}")
                continue
            except ImportError as e:
                # Some CentOS OpenSSL issues can cause this to fail
                logger.warning(f"Failed to load for class {class_name}, {e}")
                continue
            module_parts = module_path.split(".")
            if len(module_parts) < 2:
                # The URL needs the second path component (e.g. "vectorstores");
                # a dot-less module name cannot be mapped to a reference page.
                logger.warning(
                    f"Cannot build docs URL for {class_name} from module {module_path}"
                )
                continue
            url = f"{base_url}/{module_parts[1]}/{module_path}.{class_name}.html"
            # Add the import information to our list
            imports.append(
                {
                    "imported": class_name,
                    "source": source_module,
                    "docs": url,
                    "title": doc_title,
                }
            )
        if not imports:
            # If there are no imports, return the original match untouched
            return match.group(0)
        all_imports.extend(imports)
        # Create a unique comment containing the import information
        import_comment = f"<!--IMPORTS:{json.dumps(imports)}-->"
        # Inject the import comment at the start of the code block
        return match.group(1) + import_comment + "\n" + code + match.group(3)
    # Use re.sub to replace each Python code block
    data = code_block_re.sub(replacer, data)
    with open(file, "w", encoding="utf-8") as f:
        f.write(data)
    return all_imports
 if __name__ == "__main__":
    main()
--- a/docs/docs_skeleton/src/theme/CodeBlock/index.js
+++ b/docs/docs_skeleton/src/theme/CodeBlock/index.js
@@ -21,7 +21,7 @@ function Imports({ imports }) {
      </h4>
      <ul style={{ paddingBottom: "1rem" }}>
        {imports.map(({ imported, source, docs }) => (
-          <li>
+          <li key={imported}>
            <a href={docs}>
              <span>{imported}</span>
            </a>{" "}
@@ -34,14 +34,25 @@ function Imports({ imports }) {
 }
 export default function CodeBlockWrapper({ children, ...props }) {
  // Initialize imports as an empty array
  let imports = [];
  // Check if children is a string
  if (typeof children === "string") {
-    return <CodeBlock {...props}>{children}</CodeBlock>;
+    // Search for an IMPORTS comment in the code
    const match = /<!--IMPORTS:(.*?)-->\n/.exec(children);
    if (match) {
      imports = JSON.parse(match[1]);
      children = children.replace(match[0], "");
    }
  } else if (children.imports) {
    imports = children.imports;
  }
  return (
    <>
-      <CodeBlock {...props}>{children.content}</CodeBlock>
+      <CodeBlock {...props}>{children}</CodeBlock>
-      <Imports imports={children.imports} />
+      {imports.length > 0 && <Imports imports={imports} />}
    </>
  );
 }
--- a/docs/docs_skeleton/vercel_build.sh
+++ b/docs/docs_skeleton/vercel_build.sh
@@ -1,10 +1,32 @@
 #!/bin/bash
 ### See: https://github.com/urllib3/urllib3/issues/2168
 # Requests lib breaks for old SSL versions,
 # which are defaults on Amazon Linux 2 (which Vercel uses for builds)
 yum -y update
 yum remove openssl-devel -y
 yum install gcc bzip2-devel libffi-devel zlib-devel wget tar -y
 yum install openssl11 -y
 yum install openssl11-devel -y
 # Install python 3.11 to connect with openSSL 1.1.1
 wget https://www.python.org/ftp/python/3.11.4/Python-3.11.4.tgz 
 tar xzf Python-3.11.4.tgz 
 cd Python-3.11.4 
 ./configure 
 make altinstall
 # Check python version
 echo "Python Version"
 python3.11 --version
 cd ..
-python3 --version
+###
-python3 -m venv .venv
+
 # Install nbdev and generate docs
 cd ..
 python3.11 -m venv .venv
 source .venv/bin/activate
-python3 -m pip install -r vercel_requirements.txt
+python3.11 -m pip install --upgrade pip
 python3.11 -m pip install -r vercel_requirements.txt
 cp -r extras/* docs_skeleton/docs
 cd docs_skeleton
 nbdoc_build
 python3.11 generate_api_reference_links.py
--- a/docs/extras/integrations/document_loaders/chatgpt_loader.ipynb
+++ b/docs/extras/integrations/document_loaders/chatgpt_loader.ipynb
@@ -4,7 +4,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "### ChatGPT Data\n",
+    "# ChatGPT Data\n",
    "\n",
    ">[ChatGPT](https://chat.openai.com) is an artificial intelligence (AI) chatbot developed by OpenAI.\n",
    "\n",
--- a/docs/extras/integrations/llms/octoai.ipynb
+++ b/docs/extras/integrations/llms/octoai.ipynb
@@ -5,7 +5,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## OctoAI Compute Service\n",
+    "# OctoAI Compute Service\n",
    "This example goes over how to use LangChain to interact with `OctoAI` [LLM endpoints](https://octoai.cloud/templates)\n",
    "## Environment setup\n",
    "\n",
--- a/docs/extras/integrations/tools/awslambda.ipynb
+++ b/docs/extras/integrations/tools/awslambda.ipynb
@@ -5,7 +5,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## AWS Lambda API"
+    "# AWS Lambda API"
   ]
  },
  {
--- a/docs/extras/use_cases/agents/multi_modal_output_agent.ipynb
+++ b/docs/extras/use_cases/agents/multi_modal_output_agent.ipynb
@@ -5,7 +5,7 @@
   "id": "cd835d40",
   "metadata": {},
   "source": [
-    "## Multi-modal outputs: Image & Text"
+    "# Multi-modal outputs: Image & Text"
   ]
  },
  {
--- a/docs/extras/use_cases/multi_modal/image_agent.ipynb
+++ b/docs/extras/use_cases/multi_modal/image_agent.ipynb
@@ -5,7 +5,7 @@
   "id": "cd835d40",
   "metadata": {},
   "source": [
-    "## Multi-modal outputs: Image & Text"
+    "# Multi-modal outputs: Image & Text"
   ]
  },
  {
--- a/docs/vercel_requirements.txt
+++ b/docs/vercel_requirements.txt
@@ -1 +1,2 @@
 -e ../libs/langchain
 nbdoc