Add api cross ref linking (#8275)

Example of how it would show up in our python docs: ![image](https://github.com/langchain-ai/langchain/assets/13333726/0f0a88cc-ba4a-4778-bc47-118c66807f15) Examples added to the reference docs: https://api.python.langchain.com/en/wfh-api_crosslink/vectorstores/langchain.vectorstores.chroma.Chroma.html#langchain.vectorstores.chroma.Chroma ![image](https://github.com/langchain-ai/langchain/assets/13333726/dcd150de-cb56-4d42-b49a-a76a002a5a52)
2025-09-05 04:55:14 +00:00 · 2023-07-26 12:38:58 -07:00
parent a612800ef0
commit 01a9b06400
16 changed files with 263 additions and 28 deletions
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@@ -20,3 +20,5 @@ jobs:
        uses: actions/checkout@v3
      - name: Codespell
        uses: codespell-project/actions-codespell@v2
+        with:
+          skip: guide_imports.json
--- a/docs/api_reference/conf.py
+++ b/docs/api_reference/conf.py
@@ -7,20 +7,66 @@

 # -- Path setup --------------------------------------------------------------

+import json
+import os
+import sys
+from pathlib import Path
+
+import toml
+from docutils import nodes
+from sphinx.util.docutils import SphinxDirective
+
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-import os
-import sys
-
-import toml

+_DIR = Path(__file__).parent.absolute()
 sys.path.insert(0, os.path.abspath("."))
 sys.path.insert(0, os.path.abspath("../../libs/langchain"))

-with open("../../libs/langchain/pyproject.toml") as f:
+with (_DIR.parents[1] / "libs" / "langchain" / "pyproject.toml").open("r") as f:
    data = toml.load(f)
+with (_DIR / "guide_imports.json").open("r") as f:
+    imported_classes = json.load(f)
+
+
+class ExampleLinksDirective(SphinxDirective):
+    """Directive that renders a bullet list of links to usage examples.
+
+    A separate script extracts the names imported by our notebook examples
+    and records them in ``guide_imports.json``, which is loaded into
+    ``imported_classes`` as ``{name: {example title: example URL}}``. This
+    directive looks up its single argument in that mapping so the API
+    reference docs can backlink to the examples.
+
+    Usage in the template ``*.rst`` files::
+
+        .. example_links:: ClassName
+    """
+
+    has_content = False
+    required_arguments = 1
+
+    def run(self):
+        """Build the nodes for one ``.. example_links:: <name>`` occurrence.
+
+        Returns:
+            ``[title, bullet_list]`` with one linked list item per recorded
+            example, or an empty list when nothing is recorded for the name
+            so that no empty bullet list ends up in the document.
+        """
+        class_or_func_name = self.arguments[0]
+        links = imported_classes.get(class_or_func_name, {})
+        if not links:
+            # No examples for this name: contribute no nodes at all.
+            return []
+
+        list_node = nodes.bullet_list()
+        for doc_name, link in links.items():
+            # Each entry is list_item > paragraph > reference(text).
+            link_node = nodes.reference()
+            link_node["refuri"] = link
+            link_node.append(nodes.Text(doc_name))
+            para_node = nodes.paragraph()
+            para_node.append(link_node)
+            item_node = nodes.list_item()
+            item_node.append(para_node)
+            list_node.append(item_node)
+
+        title_node = nodes.title()
+        title_node.append(nodes.Text(f"Examples using {class_or_func_name}"))
+        return [title_node, list_node]
+
+
+def setup(app):
+    """Register ``ExampleLinksDirective`` on ``app`` under the name ``example_links``."""
+    app.add_directive("example_links", ExampleLinksDirective)
+

 # -- Project information -----------------------------------------------------

--- a/docs/api_reference/create_api_rst.py
+++ b/docs/api_reference/create_api_rst.py
@@ -78,6 +78,7 @@ Functions

 .. autosummary::
    :toctree: {module}
+    :template: function.rst

    {fstring}

--- a/docs/api_reference/guide_imports.json
+++ b/docs/api_reference/guide_imports.json
--- a/docs/api_reference/modules/evaluation.rst
+++ b/docs/api_reference/modules/evaluation.rst
@@ -1,9 +0,0 @@
-Evaluation
-=======================
-
-LangChain has a number of convenient evaluation chains you can use off the shelf to grade your models' oupputs.
-
-.. automodule:: langchain.evaluation
-   :members:
-   :undoc-members:
-   :inherited-members:
--- a/docs/api_reference/templates/class.rst
+++ b/docs/api_reference/templates/class.rst
@@ -26,3 +26,5 @@
   {%- endfor %}
   {% endif %}
   {% endblock %}
+
+.. example_links:: {{ objname }}
--- a/docs/api_reference/templates/function.rst
+++ b/docs/api_reference/templates/function.rst
@@ -0,0 +1,8 @@
+:mod:`{{module}}`.{{objname}}
+{{ underline }}==============
+
+.. currentmodule:: {{ module }}
+
+.. autofunction:: {{ objname }}
+
+.. example_links:: {{ objname }}
--- a/docs/docs_skeleton/generate_api_reference_links.py
+++ b/docs/docs_skeleton/generate_api_reference_links.py
@@ -0,0 +1,150 @@
+import importlib
+import inspect
+import json
+import logging
+import os
+import re
+from pathlib import Path
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Base URL for all class documentation (note: already ends with a "/")
+_BASE_URL = "https://api.python.langchain.com/en/latest/"
+
+# Regular expression to match Python code blocks.
+# Groups: 1 = opening fence line, 2 = code inside the block, 3 = closing fence line.
+code_block_re = re.compile(r"^(```python\n)(.*?)(```\n)", re.DOTALL | re.MULTILINE)
+# Regular expression to match langchain import lines.
+# Groups: 2 = dotted module path, 4 = the imported name. Only the first
+# identifier after ``import`` is captured, so ``import A, B`` yields just ``A``.
+_IMPORT_RE = re.compile(r"(from\s+(langchain\.\w+(\.\w+)*?)\s+import\s+)(\w+)")
+
+# Directory containing this script
+_CURRENT_PATH = Path(__file__).parent.absolute()
+# Directory where generated markdown files are stored
+_DOCS_DIR = _CURRENT_PATH / "docs"
+# JSON file, next to the API reference sources, that main() writes the collected imports to
+_JSON_PATH = _CURRENT_PATH.parent / "api_reference" / "guide_imports.json"
+
+
+def find_files(path):
+    """Yield the path of every Markdown/MDX file found under ``path``.
+
+    Args:
+        path: Root directory, walked recursively with ``os.walk``.
+
+    Yields:
+        ``os.path.join(root, file)`` for each file whose name ends with
+        ``.mdx`` or ``.md``.
+    """
+    for root, _, files in os.walk(path):
+        for file in files:
+            # str.endswith accepts a tuple, so one call covers both suffixes.
+            if file.endswith((".mdx", ".md")):
+                yield os.path.join(root, file)
+
+
+def get_full_module_name(module_path, class_name):
+    """Return the name of the module that defines ``class_name``.
+
+    ``module_path`` is imported and the attribute ``class_name`` is read from
+    it; ``inspect.getmodule`` then resolves the module the object actually
+    belongs to, which can differ from ``module_path`` when the name is
+    re-exported.
+    """
+    imported_object = getattr(importlib.import_module(module_path), class_name)
+    return inspect.getmodule(imported_object).__name__
+
+
+def main():
+    """Annotate every doc file and write the import -> examples mapping.
+
+    Each file yielded by ``find_files(_DOCS_DIR)`` is passed through
+    ``replace_imports``. The imports it reports are grouped as
+    ``{imported name: {doc title: doc URL}}`` and dumped as JSON to
+    ``_JSON_PATH``.
+    """
+    global_imports = {}
+
+    for file in find_files(_DOCS_DIR):
+        print(f"Adding links for imports in {file}")
+
+        # replace_imports rewrites the file in place and returns the imports it found
+        file_imports = replace_imports(file)
+        if not file_imports:
+            continue
+
+        # Build the page URL from the path relative to the docs root. Only the
+        # final extension is dropped (a ".md" elsewhere in the path is kept),
+        # and as_posix() keeps "/" separators regardless of platform.
+        relative_path = Path(os.path.relpath(file, _DOCS_DIR)).with_suffix("")
+        doc_url = f"https://python.langchain.com/docs/{relative_path.as_posix()}"
+        for import_info in file_imports:
+            examples = global_imports.setdefault(import_info["imported"], {})
+            examples[import_info["title"]] = doc_url
+
+    # Write the global imports information to a JSON file
+    with _JSON_PATH.open("w") as f:
+        json.dump(global_imports, f)
+
+
+def _get_doc_title(data: str, file_name: str) -> str:
+    """Pick a title for a document, falling back to its file name.
+
+    The first Markdown level-one heading (``# Title``) wins. Failing that,
+    the first rst-style title (a line followed by a line of ``=``) is used.
+    If neither is present, ``file_name`` is returned unchanged.
+    """
+    title_patterns = (
+        r"^#\s+(.*)",  # Markdown heading
+        r"^(.*)\n=+\n",  # rst-style underlined title
+    )
+    for pattern in title_patterns:
+        found = re.search(pattern, data, re.MULTILINE)
+        if found is not None:
+            return found.group(1)
+    return file_name
+
+
+def replace_imports(file):
+    """Annotate the Python code blocks of ``file`` with their langchain imports.
+
+    Every fenced ``python`` code block is scanned with ``_IMPORT_RE``. For each
+    import that can be resolved, a link to its API reference page is built,
+    and the block gets an ``<!--IMPORTS:[...]-->`` comment injected as its
+    first line (replacing any comment left by a previous run). The file is
+    rewritten in place.
+
+    Args:
+        file: Path of the Markdown/MDX file to process.
+
+    Returns:
+        A list with one dict per resolved import, with the keys
+        ``imported``, ``source``, ``docs`` and ``title``.
+    """
+    all_imports = []
+    with open(file, "r") as f:
+        data = f.read()
+
+    doc_title = _get_doc_title(data, os.path.basename(file))
+
+    # Matches a previously injected comment so re-runs do not stack them.
+    # TODO: Use our own custom <code> component rather than this
+    # injection method
+    existing_comment_re = re.compile(r"^<!--IMPORTS:.*?-->\n", re.MULTILINE)
+    # _BASE_URL already ends with "/"; strip it so joining with "/" below
+    # never produces a double slash in the generated links.
+    base_url = _BASE_URL.rstrip("/")
+
+    def replacer(match):
+        # Code block content, without any import comment from an earlier run
+        code = existing_comment_re.sub("", match.group(2))
+
+        # Process imports in the code block
+        imports = []
+        for import_match in _IMPORT_RE.finditer(code):
+            source_module = import_match.group(2)
+            class_name = import_match.group(4)
+            try:
+                module_path = get_full_module_name(source_module, class_name)
+            except AttributeError as e:
+                logger.warning(f"Could not find module for {class_name}, {e}")
+                continue
+            except ImportError as e:
+                # Some CentOS OpenSSL issues can cause this to fail
+                logger.warning(f"Failed to load for class {class_name}, {e}")
+                continue
+
+            module_parts = module_path.split(".")
+            if len(module_parts) < 2:
+                # The URL needs the segment after the top-level package;
+                # skip objects whose module has none instead of crashing.
+                logger.warning(
+                    f"Cannot build docs URL for {class_name} from module {module_path}"
+                )
+                continue
+
+            url = f"{base_url}/{module_parts[1]}/{module_path}.{class_name}.html"
+
+            # Add the import information to our list
+            imports.append(
+                {
+                    "imported": class_name,
+                    "source": source_module,
+                    "docs": url,
+                    "title": doc_title,
+                }
+            )
+
+        if not imports:
+            # If there are no imports, return the original match untouched
+            return match.group(0)
+
+        all_imports.extend(imports)
+        # Inject a comment carrying the import information at the start of
+        # the code block
+        import_comment = f"<!--IMPORTS:{json.dumps(imports)}-->"
+        return match.group(1) + import_comment + "\n" + code + match.group(3)
+
+    # Use re.sub to replace each Python code block
+    data = code_block_re.sub(replacer, data)
+
+    with open(file, "w") as f:
+        f.write(data)
+    return all_imports
+
+
+if __name__ == "__main__":
+    main()
--- a/docs/docs_skeleton/src/theme/CodeBlock/index.js
+++ b/docs/docs_skeleton/src/theme/CodeBlock/index.js
@@ -21,7 +21,7 @@ function Imports({ imports }) {
      </h4>
      <ul style={{ paddingBottom: "1rem" }}>
        {imports.map(({ imported, source, docs }) => (
-          <li>
+          <li key={imported}>
            <a href={docs}>
              <span>{imported}</span>
            </a>{" "}
@@ -34,14 +34,25 @@ function Imports({ imports }) {
 }

 export default function CodeBlockWrapper({ children, ...props }) {
+  // Initialize imports as an empty array
+  let imports = [];
+
+  // Check if children is a string
  if (typeof children === "string") {
-    return <CodeBlock {...props}>{children}</CodeBlock>;
+    // Search for an IMPORTS comment in the code
+    const match = /<!--IMPORTS:(.*?)-->\n/.exec(children);
+    if (match) {
+      imports = JSON.parse(match[1]);
+      children = children.replace(match[0], "");
+    }
+  } else if (children.imports) {
+    imports = children.imports;
  }

  return (
    <>
-      <CodeBlock {...props}>{children.content}</CodeBlock>
-      <Imports imports={children.imports} />
+      <CodeBlock {...props}>{children}</CodeBlock>
+      {imports.length > 0 && <Imports imports={imports} />}
    </>
  );
 }
--- a/docs/docs_skeleton/vercel_build.sh
+++ b/docs/docs_skeleton/vercel_build.sh
@@ -1,10 +1,32 @@
 #!/bin/bash

+### See: https://github.com/urllib3/urllib3/issues/2168
+# Requests lib breaks for old SSL versions,
+# which are defaults on Amazon Linux 2 (which Vercel uses for builds)
+yum -y update
+yum remove openssl-devel -y
+yum install gcc bzip2-devel libffi-devel zlib-devel wget tar -y
+yum install openssl11 -y
+yum install openssl11-devel -y
+# Install Python 3.11 to work with OpenSSL 1.1.1
+wget https://www.python.org/ftp/python/3.11.4/Python-3.11.4.tgz 
+tar xzf Python-3.11.4.tgz 
+cd Python-3.11.4 
+./configure 
+make altinstall
+# Check python version
+echo "Python Version"
+python3.11 --version
 cd ..
-python3 --version
-python3 -m venv .venv
+###
+
+# Install nbdoc and generate docs
+cd ..
+python3.11 -m venv .venv
 source .venv/bin/activate
-python3 -m pip install -r vercel_requirements.txt
+python3.11 -m pip install --upgrade pip
+python3.11 -m pip install -r vercel_requirements.txt
 cp -r extras/* docs_skeleton/docs
 cd docs_skeleton
 nbdoc_build
+python3.11 generate_api_reference_links.py
--- a/docs/extras/integrations/document_loaders/chatgpt_loader.ipynb
+++ b/docs/extras/integrations/document_loaders/chatgpt_loader.ipynb
@@ -4,7 +4,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "### ChatGPT Data\n",
+    "# ChatGPT Data\n",
    "\n",
    ">[ChatGPT](https://chat.openai.com) is an artificial intelligence (AI) chatbot developed by OpenAI.\n",
    "\n",
--- a/docs/extras/integrations/llms/octoai.ipynb
+++ b/docs/extras/integrations/llms/octoai.ipynb
@@ -5,7 +5,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## OctoAI Compute Service\n",
+    "# OctoAI Compute Service\n",
    "This example goes over how to use LangChain to interact with `OctoAI` [LLM endpoints](https://octoai.cloud/templates)\n",
    "## Environment setup\n",
    "\n",
--- a/docs/extras/integrations/tools/awslambda.ipynb
+++ b/docs/extras/integrations/tools/awslambda.ipynb
@@ -5,7 +5,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## AWS Lambda API"
+    "# AWS Lambda API"
   ]
  },
  {
--- a/docs/extras/use_cases/agents/multi_modal_output_agent.ipynb
+++ b/docs/extras/use_cases/agents/multi_modal_output_agent.ipynb
@@ -5,7 +5,7 @@
   "id": "cd835d40",
   "metadata": {},
   "source": [
-    "## Multi-modal outputs: Image & Text"
+    "# Multi-modal outputs: Image & Text"
   ]
  },
  {
--- a/docs/extras/use_cases/multi_modal/image_agent.ipynb
+++ b/docs/extras/use_cases/multi_modal/image_agent.ipynb
@@ -5,7 +5,7 @@
   "id": "cd835d40",
   "metadata": {},
   "source": [
-    "## Multi-modal outputs: Image & Text"
+    "# Multi-modal outputs: Image & Text"
   ]
  },
  {
--- a/docs/vercel_requirements.txt
+++ b/docs/vercel_requirements.txt
@@ -1 +1,2 @@
+-e ../libs/langchain
 nbdoc