mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-05 04:55:14 +00:00
docs: arxiv
page, added cookbooks (#22215)
Issue: The `arXiv` page is missing the arxiv paper references from the `langchain/cookbook`. PR: Added the cookbook references. Result: `Found 29 arXiv references in the 3 docs, 21 API Refs, 5 Templates, and 18 Cookbooks.` - many more references are visible now.
This commit is contained in:
@@ -7,7 +7,7 @@ import os
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Set
|
||||
from typing import Any, Dict
|
||||
|
||||
from pydantic.v1 import BaseModel, root_validator
|
||||
|
||||
@@ -17,6 +17,7 @@ _ROOT_DIR = Path(os.path.abspath(__file__)).parents[2]
|
||||
DOCS_DIR = _ROOT_DIR / "docs" / "docs"
|
||||
CODE_DIR = _ROOT_DIR / "libs"
|
||||
TEMPLATES_DIR = _ROOT_DIR / "templates"
|
||||
COOKBOOKS_DIR = _ROOT_DIR / "cookbook"
|
||||
ARXIV_ID_PATTERN = r"https://arxiv\.org/(abs|pdf)/(\d+\.\d+)"
|
||||
LANGCHAIN_PYTHON_URL = "python.langchain.com"
|
||||
|
||||
@@ -29,6 +30,7 @@ class ArxivPaper:
|
||||
referencing_doc2url: dict[str, str]
|
||||
referencing_api_ref2url: dict[str, str]
|
||||
referencing_template2url: dict[str, str]
|
||||
referencing_cookbook2url: dict[str, str]
|
||||
title: str
|
||||
authors: list[str]
|
||||
abstract: str
|
||||
@@ -50,7 +52,6 @@ def search_documentation_for_arxiv_references(docs_dir: Path) -> dict[str, set[s
|
||||
arxiv_url_pattern = re.compile(ARXIV_ID_PATTERN)
|
||||
exclude_strings = {"file_path", "metadata", "link", "loader", "PyPDFLoader"}
|
||||
|
||||
# loop all the files (ipynb, mdx, md) in the docs folder
|
||||
files = (
|
||||
p.resolve()
|
||||
for p in Path(docs_dir).glob("**/*")
|
||||
@@ -76,39 +77,6 @@ def search_documentation_for_arxiv_references(docs_dir: Path) -> dict[str, set[s
|
||||
return arxiv_id2file_names
|
||||
|
||||
|
||||
def convert_module_name_and_members_to_urls(
    arxiv_id2module_name_and_members: dict[str, set[str]],
) -> dict[str, set[str]]:
    """Convert module/member references into relative API Reference URLs.

    Each reference is a string of the form ``"<module>:<type>"`` or
    ``"<module>:<type>$<member>"``, where ``<type>`` is ``module``,
    ``class`` or ``function``.

    Args:
        arxiv_id2module_name_and_members: Mapping of arXiv id to the set of
            references collected for that paper.

    Returns:
        Mapping of arXiv id to the set of relative URLs built from its
        references.

    Raises:
        ValueError: If a reference has an unknown type, or does not split
            into exactly one module part and one type part.
    """
    arxiv_id2urls: dict[str, set[str]] = {}
    for arxiv_id, module_name_and_members in arxiv_id2module_name_and_members.items():
        urls = set()
        for module_name_and_member in module_name_and_members:
            module_name, type_and_member = module_name_and_member.split(":")
            # Named `kind` so the builtin `type` is not shadowed.
            if "$" in type_and_member:
                kind, member = type_and_member.split("$")
            else:
                kind, member = type_and_member, ""
            namespace_parts = module_name.split(".")
            if kind == "module":
                # Strip only the leading "langchain_"; str.replace would also
                # remove any later occurrence inside the package name.
                package = namespace_parts[0].removeprefix("langchain_")
                url = f"{package}_api_reference.html#module-{module_name}"
            elif kind in ("class", "function"):
                second_namespace_part = namespace_parts[1]
                url = f"{second_namespace_part}/{module_name}.{member}.html#{module_name}.{member}"
            else:
                raise ValueError(
                    f"Unknown type: {kind} in the {module_name_and_member}."
                )
            urls.add(url)
        arxiv_id2urls[arxiv_id] = urls
    return arxiv_id2urls
|
||||
|
||||
|
||||
def search_code_for_arxiv_references(code_dir: Path) -> dict[str, set[str]]:
|
||||
"""Search the code for arXiv references.
|
||||
|
||||
@@ -220,7 +188,6 @@ def search_code_for_arxiv_references(code_dir: Path) -> dict[str, set[str]]:
|
||||
|
||||
def search_templates_for_arxiv_references(templates_dir: Path) -> dict[str, set[str]]:
|
||||
arxiv_url_pattern = re.compile(ARXIV_ID_PATTERN)
|
||||
# exclude_strings = {"file_path", "metadata", "link", "loader", "PyPDFLoader"}
|
||||
|
||||
# loop all the Readme.md files since they are parsed into LangChain documentation
|
||||
# exclude the Readme.md in the root folder
|
||||
@@ -234,8 +201,6 @@ def search_templates_for_arxiv_references(templates_dir: Path) -> dict[str, set[
|
||||
with open(file, "r", encoding="utf-8") as f:
|
||||
lines = f.readlines()
|
||||
for line in lines:
|
||||
# if any(exclude_string in line for exclude_string in exclude_strings):
|
||||
# continue
|
||||
matches = arxiv_url_pattern.search(line)
|
||||
if matches:
|
||||
arxiv_id = matches.group(2)
|
||||
@@ -247,6 +212,58 @@ def search_templates_for_arxiv_references(templates_dir: Path) -> dict[str, set[
|
||||
return arxiv_id2template_names
|
||||
|
||||
|
||||
def search_cookbooks_for_arxiv_references(cookbooks_dir: Path) -> dict[str, set[str]]:
    """Search the cookbook notebooks for arXiv references.

    Args:
        cookbooks_dir: Directory searched recursively for ``*.ipynb`` files.

    Returns:
        Mapping of arXiv id to the names of the notebooks referencing it.
        A name is the notebook path relative to ``cookbooks_dir`` without
        the ``.ipynb`` suffix, so a top-level notebook is named by its stem.
    """
    arxiv_url_pattern = re.compile(ARXIV_ID_PATTERN)
    root = Path(cookbooks_dir).resolve()
    arxiv_id2cookbook_names: dict[str, set[str]] = {}
    for file in root.glob("**/*.ipynb"):
        # Keep the sub-directory of nested notebooks: the glob is recursive,
        # and a bare stem would lose the path needed to link to the notebook.
        cookbook_name = file.relative_to(root).with_suffix("").as_posix()
        with open(file, "r", encoding="utf-8") as f:
            for line in f:
                # finditer, not search: one line may cite several papers.
                for match in arxiv_url_pattern.finditer(line):
                    arxiv_id = match.group(2)
                    arxiv_id2cookbook_names.setdefault(arxiv_id, set()).add(
                        cookbook_name
                    )
    return arxiv_id2cookbook_names
|
||||
|
||||
|
||||
def convert_module_name_and_members_to_urls(
    arxiv_id2module_name_and_members: dict[str, set[str]],
) -> dict[str, set[str]]:
    """Convert module/member references into relative API Reference URLs.

    Args:
        arxiv_id2module_name_and_members: Mapping of arXiv id to a set of
            references, each written as ``"<module>:<type>"`` or
            ``"<module>:<type>$<member>"``.

    Returns:
        Mapping of arXiv id to the set of relative URLs built from its
        references.

    Raises:
        ValueError: If a reference has a type other than ``module``,
            ``class`` or ``function``, or is not of the expected form.
    """
    return {
        arxiv_id: {_api_ref_url_for(reference) for reference in references}
        for arxiv_id, references in arxiv_id2module_name_and_members.items()
    }


def _api_ref_url_for(module_name_and_member: str) -> str:
    """Build the relative API Reference URL for a single reference string."""
    module_name, type_and_member = module_name_and_member.split(":")
    if "$" in type_and_member:
        type_, member = type_and_member.split("$")
    else:
        type_, member = type_and_member, ""
    namespace_parts = module_name.split(".")

    if type_ == "module":
        # removeprefix strips only the leading "langchain_"; str.replace
        # would also delete later occurrences inside the package name.
        first_namespace_part = namespace_parts[0].removeprefix("langchain_")
        return f"{first_namespace_part}_api_reference.html#module-{module_name}"
    if type_ in ("class", "function"):
        second_namespace_part = namespace_parts[1]
        return f"{second_namespace_part}/{module_name}.{member}.html#{module_name}.{member}"
    raise ValueError(f"Unknown type: {type_} in the {module_name_and_member}.")
|
||||
|
||||
|
||||
def _get_doc_path(file_parts: tuple[str, ...], file_extension) -> str:
|
||||
"""Get the relative path to the documentation page
|
||||
from the absolute path of the file.
|
||||
@@ -285,60 +302,6 @@ def _get_module_name(file_parts: tuple[str, ...]) -> str:
|
||||
return ".".join(ns_parts)
|
||||
|
||||
|
||||
def compound_urls(
    arxiv_id2file_names: dict[str, set[str]],
    arxiv_id2code_urls: dict[str, set[str]],
    arxiv_id2templates: dict[str, set[str]],
) -> dict[str, dict[str, dict[str, str]]]:
    """Format, verify and merge the references of every arXiv paper.

    Args:
        arxiv_id2file_names: arXiv id -> keys passed to ``_format_doc_url``.
        arxiv_id2code_urls: arXiv id -> keys passed to ``_format_api_ref_url``.
        arxiv_id2templates: arXiv id -> keys passed to ``_format_template_url``.

    Returns:
        ``{arxiv_id: {type: {key: url}}}`` where ``type`` is ``"docs"``,
        ``"apis"`` or ``"templates"``. Only URLs accepted by ``_is_url_ok``
        are kept, empty groups are omitted, and the outer mapping is ordered
        by arXiv id, descending.
    """

    def _checked_urls(keys, format_url):
        """Map each key to its formatted URL, keeping those `_is_url_ok` accepts."""
        key2url = {}
        for key in keys:
            # Format once and reuse the result for both the check and the value.
            url = format_url(key)
            if _is_url_ok(url):
                key2url[key] = url
        return key2url

    # Order matters: it fixes the order of the per-paper "type" keys.
    sources = (
        ("docs", arxiv_id2file_names, _format_doc_url),
        ("apis", arxiv_id2code_urls, _format_api_ref_url),
        ("templates", arxiv_id2templates, _format_template_url),
    )
    arxiv_id2type2key2urls: dict[str, dict[str, dict[str, str]]] = {}
    for ref_type, arxiv_id2keys, format_url in sources:
        for arxiv_id, keys in arxiv_id2keys.items():
            key2urls = _checked_urls(keys, format_url)
            if key2urls:
                arxiv_id2type2key2urls.setdefault(arxiv_id, {})[ref_type] = key2urls

    # reverse sort by the arxiv_id (the newest papers first)
    return dict(
        sorted(arxiv_id2type2key2urls.items(), key=lambda item: item[0], reverse=True)
    )
|
||||
|
||||
|
||||
def _is_url_ok(url: str) -> bool:
|
||||
"""Check if the url page is open without error."""
|
||||
import requests
|
||||
@@ -424,6 +387,9 @@ class ArxivAPIWrapper(BaseModel):
|
||||
referencing_template2url=type2key2urls["templates"]
|
||||
if "templates" in type2key2urls
|
||||
else {},
|
||||
referencing_cookbook2url=type2key2urls["cookbooks"]
|
||||
if "cookbooks" in type2key2urls
|
||||
else {},
|
||||
)
|
||||
for result, type2key2urls in zip(results, arxiv_id2type2key2urls.values())
|
||||
]
|
||||
@@ -443,6 +409,10 @@ def _format_template_url(template_name: str) -> str:
|
||||
return f"https://{LANGCHAIN_PYTHON_URL}/docs/templates/{template_name}"
|
||||
|
||||
|
||||
def _format_cookbook_url(cookbook_name: str) -> str:
|
||||
return f"https://github.com/langchain-ai/langchain/blob/master/cookbook/{cookbook_name}.ipynb"
|
||||
|
||||
|
||||
def _compact_module_full_name(doc_path: str) -> str:
|
||||
# agents/langchain_core.agents.AgentAction.html#langchain_core.agents.AgentAction
|
||||
module = doc_path.split("#")[1].replace("module-", "")
|
||||
@@ -454,9 +424,79 @@ def _compact_module_full_name(doc_path: str) -> str:
|
||||
return module
|
||||
|
||||
|
||||
def compound_urls(
    arxiv_id2file_names: dict[str, set[str]],
    arxiv_id2code_urls: dict[str, set[str]],
    arxiv_id2templates: dict[str, set[str]],
    arxiv_id2cookbooks: dict[str, set[str]],
) -> dict[str, dict[str, dict[str, str]]]:
    """Format, verify and merge the references of every arXiv paper.

    Args:
        arxiv_id2file_names: arXiv id -> keys passed to ``_format_doc_url``.
        arxiv_id2code_urls: arXiv id -> keys passed to ``_format_api_ref_url``.
        arxiv_id2templates: arXiv id -> keys passed to ``_format_template_url``.
        arxiv_id2cookbooks: arXiv id -> keys passed to ``_format_cookbook_url``.

    Returns:
        ``{arxiv_id: {type: {key: url}}}`` where ``type`` is ``"docs"``,
        ``"apis"``, ``"templates"`` or ``"cookbooks"``. Only URLs accepted by
        ``_is_url_ok`` are kept, empty groups are omitted, and the outer
        mapping is ordered by arXiv id, descending.
    """
    # Order matters: it fixes the order of the per-paper "type" keys.
    sources = (
        ("docs", arxiv_id2file_names, _format_doc_url),
        ("apis", arxiv_id2code_urls, _format_api_ref_url),
        ("templates", arxiv_id2templates, _format_template_url),
        ("cookbooks", arxiv_id2cookbooks, _format_cookbook_url),
    )

    arxiv_id2type2key2urls: dict[str, dict[str, dict[str, str]]] = {}
    for ref_type, arxiv_id2keys, format_url in sources:
        for arxiv_id, keys in arxiv_id2keys.items():
            key2urls = {}
            for key in keys:
                # Format once and reuse the result for the check and the value.
                url = format_url(key)
                if _is_url_ok(url):
                    key2urls[key] = url
            # Papers whose references all failed the check are left out.
            if key2urls:
                arxiv_id2type2key2urls.setdefault(arxiv_id, {})[ref_type] = key2urls

    # reverse sort by the arxiv_id (the newest papers first)
    ret = dict(
        sorted(arxiv_id2type2key2urls.items(), key=lambda item: item[0], reverse=True)
    )
    return ret
|
||||
|
||||
|
||||
def log_results(arxiv_id2type2key2urls):
|
||||
arxiv_ids = arxiv_id2type2key2urls.keys()
|
||||
doc_number, api_number, templates_number = 0, 0, 0
|
||||
doc_number, api_number, templates_number, cookbooks_number = 0, 0, 0, 0
|
||||
for type2key2url in arxiv_id2type2key2urls.values():
|
||||
if "docs" in type2key2url:
|
||||
doc_number += len(type2key2url["docs"])
|
||||
@@ -464,9 +504,11 @@ def log_results(arxiv_id2type2key2urls):
|
||||
api_number += len(type2key2url["apis"])
|
||||
if "templates" in type2key2url:
|
||||
templates_number += len(type2key2url["templates"])
|
||||
if "cookbooks" in type2key2url:
|
||||
cookbooks_number += len(type2key2url["cookbooks"])
|
||||
logger.warning(
|
||||
f"Found {len(arxiv_ids)} arXiv references in the {doc_number} docs, {api_number} API Refs,"
|
||||
f" and {templates_number} Templates."
|
||||
f" {templates_number} Templates, and {cookbooks_number} Cookbooks."
|
||||
)
|
||||
|
||||
|
||||
@@ -477,7 +519,7 @@ def generate_arxiv_references_page(file_name: Path, papers: list[ArxivPaper]) ->
|
||||
|
||||
LangChain implements the latest research in the field of Natural Language Processing.
|
||||
This page contains `arXiv` papers referenced in the LangChain Documentation, API Reference,
|
||||
and Templates.
|
||||
Templates, and Cookbooks.
|
||||
|
||||
## Summary
|
||||
|
||||
@@ -510,6 +552,14 @@ and Templates.
|
||||
for key, url in paper.referencing_template2url.items()
|
||||
)
|
||||
]
|
||||
if paper.referencing_cookbook2url:
|
||||
refs += [
|
||||
"`Cookbook:` "
|
||||
+ ", ".join(
|
||||
f"[{key}]({url})"
|
||||
for key, url in paper.referencing_cookbook2url.items()
|
||||
)
|
||||
]
|
||||
refs_str = ", ".join(refs)
|
||||
|
||||
title_link = f"[{paper.title}]({paper.url})"
|
||||
@@ -533,8 +583,17 @@ and Templates.
|
||||
if paper.referencing_template2url
|
||||
else ""
|
||||
)
|
||||
cookbook_refs = (
|
||||
f" - **Cookbook:** {', '.join(f'[{key}]({url})' for key, url in paper.referencing_cookbook2url.items())}"
|
||||
if paper.referencing_cookbook2url
|
||||
else ""
|
||||
)
|
||||
refs = "\n".join(
|
||||
[el for el in [docs_refs, api_ref_refs, template_refs] if el]
|
||||
[
|
||||
el
|
||||
for el in [docs_refs, api_ref_refs, template_refs, cookbook_refs]
|
||||
if el
|
||||
]
|
||||
)
|
||||
f.write(f"""
|
||||
## {paper.title}
|
||||
@@ -562,8 +621,9 @@ def main():
|
||||
)
|
||||
arxiv_id2file_names = search_documentation_for_arxiv_references(DOCS_DIR)
|
||||
arxiv_id2templates = search_templates_for_arxiv_references(TEMPLATES_DIR)
|
||||
arxiv_id2cookbooks = search_cookbooks_for_arxiv_references(COOKBOOKS_DIR)
|
||||
arxiv_id2type2key2urls = compound_urls(
|
||||
arxiv_id2file_names, arxiv_id2code_urls, arxiv_id2templates
|
||||
arxiv_id2file_names, arxiv_id2code_urls, arxiv_id2templates, arxiv_id2cookbooks
|
||||
)
|
||||
log_results(arxiv_id2type2key2urls)
|
||||
|
||||
|
Reference in New Issue
Block a user