Wfh/ref links (#8454)

2025-09-03 03:59:42 +00:00 · 2023-07-29 08:44:32 -07:00
parent 13b4f465e2
commit b7c0eb9ecb
23 changed files with 189 additions and 379 deletions
--- a/docs/docs_skeleton/generate_api_reference_links.py
+++ b/docs/docs_skeleton/generate_api_reference_links.py
@@ -5,16 +5,22 @@ import logging
 import os
 import re
 from pathlib import Path
+import argparse

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 # Base URL for all class documentation
-_BASE_URL = "https://api.python.langchain.com/en/latest/"
+_BASE_URL = "https://api.python.langchain.com/en/latest/"  # trailing "/" is required

 # Regular expression to match Python code blocks
 code_block_re = re.compile(r"^(```python\n)(.*?)(```\n)", re.DOTALL | re.MULTILINE)
 # Regular expression to match langchain import lines
-_IMPORT_RE = re.compile(r"(from\s+(langchain\.\w+(\.\w+)*?)\s+import\s+)(\w+)")
+_IMPORT_RE = re.compile(
+    r"from\s+(langchain\.\w+(\.\w+)*?)\s+import\s+"
+    r"((?:\w+(?:,\s*)?)*"  # Match zero or more words separated by a comma+optional ws
+    r"(?:\s*\(.*?\))?)",  # Match optional parentheses block
+    re.DOTALL,  # Match newlines as well
+)

 _CURRENT_PATH = Path(__file__).parent.absolute()
 # Directory where generated markdown files are stored
@@ -24,6 +30,10 @@ _JSON_PATH = _CURRENT_PATH.parent / "api_reference" / "guide_imports.json"

 def find_files(path):
    """Find all MDX files in the given path"""
+    # Check if is file first
+    if os.path.isfile(path):
+        yield path
+        return
    for root, _, files in os.walk(path):
        for file in files:
            if file.endswith(".mdx") or file.endswith(".md"):
@@ -37,20 +47,33 @@ def get_full_module_name(module_path, class_name):
    return inspect.getmodule(class_).__name__


+def get_args():
+    """Return the parsed command-line arguments for this script.
+
+    The only option defined here is ``--docs_dir``; it defaults to ``_DOCS_DIR``.
+    """
+    cli = argparse.ArgumentParser()
+    docs_dir_help = "Directory where generated markdown files are stored"
+    cli.add_argument("--docs_dir", type=str, default=_DOCS_DIR, help=docs_dir_help)
+    return cli.parse_args()
+
+
 def main():
    """Main function"""
+    args = get_args()
    global_imports = {}

-    for file in find_files(_DOCS_DIR):
+    for file in find_files(args.docs_dir):
        print(f"Adding links for imports in {file}")
-
-        # replace_imports now returns the import information rather than writing it to a file
        file_imports = replace_imports(file)

        if file_imports:
            # Use relative file path as key
-            relative_path = os.path.relpath(file, _DOCS_DIR)
-            doc_url = f"https://python.langchain.com/docs/{relative_path.replace('.mdx', '').replace('.md', '')}"
+            relative_path = (
+                os.path.relpath(file, _DOCS_DIR).replace(".mdx", "").replace(".md", "")
+            )
+
+            doc_url = f"https://python.langchain.com/docs/{relative_path}"
            for import_info in file_imports:
                doc_title = import_info["title"]
                class_name = import_info["imported"]
@@ -77,7 +100,8 @@ def _get_doc_title(data: str, file_name: str) -> str:


 def replace_imports(file):
-    """Replace imports in each Python code block with links to their documentation and append the import info in a comment"""
+    """Replace imports in each Python code block with links to their
+    documentation and append the import info in a comment"""
    all_imports = []
    with open(file, "r") as f:
        data = f.read()
@@ -97,37 +121,45 @@ def replace_imports(file):
        # Process imports in the code block
        imports = []
        for import_match in _IMPORT_RE.finditer(code):
-            class_name = import_match.group(4)
-            try:
-                module_path = get_full_module_name(import_match.group(2), class_name)
-            except AttributeError as e:
-                logger.warning(f"Could not find module for {class_name}, {e}")
-                continue
-            except ImportError as e:
-                # Some CentOS OpenSSL issues can cause this to fail
-                logger.warning(f"Failed to load for class {class_name}, {e}")
-                continue
+            module = import_match.group(1)
+            imports_str = (
+                import_match.group(3).replace("(\n", "").replace("\n)", "")
+            )  # Handle newlines within parentheses
+            # remove any newline and spaces, then split by comma
+            imported_classes = [
+                imp.strip()
+                for imp in re.split(r",\s*", imports_str.replace("\n", ""))
+                if imp.strip()
+            ]
+            for class_name in imported_classes:
+                try:
+                    module_path = get_full_module_name(module, class_name)
+                except AttributeError as e:
+                    logger.warning(f"Could not find module for {class_name}, {e}")
+                    continue
+                except ImportError as e:
+                    logger.warning(f"Failed to load for class {class_name}, {e}")
+                    continue

-            url = (
-                _BASE_URL
-                + "/"
-                + module_path.split(".")[1]
-                + "/"
-                + module_path
-                + "."
-                + class_name
-                + ".html"
-            )
+                url = (
+                    _BASE_URL
+                    + module_path.split(".")[1]
+                    + "/"
+                    + module_path
+                    + "."
+                    + class_name
+                    + ".html"
+                )

-            # Add the import information to our list
-            imports.append(
-                {
-                    "imported": class_name,
-                    "source": import_match.group(2),
-                    "docs": url,
-                    "title": _DOC_TITLE,
-                }
-            )
+                # Add the import information to our list
+                imports.append(
+                    {
+                        "imported": class_name,
+                        "source": module,
+                        "docs": url,
+                        "title": _DOC_TITLE,
+                    }
+                )

        if imports:
            all_imports.extend(imports)