Wfh/ref links (#8454)

This commit is contained in:
William FH
2023-07-29 08:44:32 -07:00
committed by GitHub
parent 13b4f465e2
commit b7c0eb9ecb
23 changed files with 189 additions and 379 deletions

View File

@@ -5,16 +5,22 @@ import logging
import os
import re
from pathlib import Path
import argparse
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Base URL for all class documentation
_BASE_URL = "https://api.python.langchain.com/en/latest/"
_BASE_URL = "https://api.python.langchain.com/en/latest"
# Regular expression to match Python code blocks
code_block_re = re.compile(r"^(```python\n)(.*?)(```\n)", re.DOTALL | re.MULTILINE)
# Regular expression to match langchain import lines
_IMPORT_RE = re.compile(r"(from\s+(langchain\.\w+(\.\w+)*?)\s+import\s+)(\w+)")
_IMPORT_RE = re.compile(
r"from\s+(langchain\.\w+(\.\w+)*?)\s+import\s+"
r"((?:\w+(?:,\s*)?)*" # Match zero or more words separated by a comma+optional ws
r"(?:\s*\(.*?\))?)", # Match optional parentheses block
re.DOTALL, # Match newlines as well
)
_CURRENT_PATH = Path(__file__).parent.absolute()
# Directory where generated markdown files are stored
@@ -24,6 +30,10 @@ _JSON_PATH = _CURRENT_PATH.parent / "api_reference" / "guide_imports.json"
def find_files(path):
"""Find all MDX files in the given path"""
# Check if is file first
if os.path.isfile(path):
yield path
return
for root, _, files in os.walk(path):
for file in files:
if file.endswith(".mdx") or file.endswith(".md"):
@@ -37,20 +47,33 @@ def get_full_module_name(module_path, class_name):
return inspect.getmodule(class_).__name__
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--docs_dir",
type=str,
default=_DOCS_DIR,
help="Directory where generated markdown files are stored",
)
return parser.parse_args()
def main():
"""Main function"""
args = get_args()
global_imports = {}
for file in find_files(_DOCS_DIR):
for file in find_files(args.docs_dir):
print(f"Adding links for imports in {file}")
# replace_imports now returns the import information rather than writing it to a file
file_imports = replace_imports(file)
if file_imports:
# Use relative file path as key
relative_path = os.path.relpath(file, _DOCS_DIR)
doc_url = f"https://python.langchain.com/docs/{relative_path.replace('.mdx', '').replace('.md', '')}"
relative_path = (
os.path.relpath(file, _DOCS_DIR).replace(".mdx", "").replace(".md", "")
)
doc_url = f"https://python.langchain.com/docs/{relative_path}"
for import_info in file_imports:
doc_title = import_info["title"]
class_name = import_info["imported"]
@@ -77,7 +100,8 @@ def _get_doc_title(data: str, file_name: str) -> str:
def replace_imports(file):
"""Replace imports in each Python code block with links to their documentation and append the import info in a comment"""
"""Replace imports in each Python code block with links to their
documentation and append the import info in a comment"""
all_imports = []
with open(file, "r") as f:
data = f.read()
@@ -97,37 +121,45 @@ def replace_imports(file):
# Process imports in the code block
imports = []
for import_match in _IMPORT_RE.finditer(code):
class_name = import_match.group(4)
try:
module_path = get_full_module_name(import_match.group(2), class_name)
except AttributeError as e:
logger.warning(f"Could not find module for {class_name}, {e}")
continue
except ImportError as e:
# Some CentOS OpenSSL issues can cause this to fail
logger.warning(f"Failed to load for class {class_name}, {e}")
continue
module = import_match.group(1)
imports_str = (
import_match.group(3).replace("(\n", "").replace("\n)", "")
) # Handle newlines within parentheses
# remove any newline and spaces, then split by comma
imported_classes = [
imp.strip()
for imp in re.split(r",\s*", imports_str.replace("\n", ""))
if imp.strip()
]
for class_name in imported_classes:
try:
module_path = get_full_module_name(module, class_name)
except AttributeError as e:
logger.warning(f"Could not find module for {class_name}, {e}")
continue
except ImportError as e:
logger.warning(f"Failed to load for class {class_name}, {e}")
continue
url = (
_BASE_URL
+ "/"
+ module_path.split(".")[1]
+ "/"
+ module_path
+ "."
+ class_name
+ ".html"
)
url = (
_BASE_URL
+ module_path.split(".")[1]
+ "/"
+ module_path
+ "."
+ class_name
+ ".html"
)
# Add the import information to our list
imports.append(
{
"imported": class_name,
"source": import_match.group(2),
"docs": url,
"title": _DOC_TITLE,
}
)
# Add the import information to our list
imports.append(
{
"imported": class_name,
"source": module,
"docs": url,
"title": _DOC_TITLE,
}
)
if imports:
all_imports.extend(imports)