"""Append a "Related" links section to generated integration doc pages.

Usage: pass the output docs directory as the single command-line argument.
Every ``.md`` / ``.mdx`` file found under an ``integrations`` directory inside
it is examined, and pages located at ``docs/integrations/<type>/<name>.md[x]``
whose ``<type>`` is listed in ``INTEGRATION_INFO`` are modified in place.
"""

import itertools
import multiprocessing
import re
import sys
from pathlib import Path
from typing import Optional, Sequence

# List of 4-tuples (integration_name, display_name, concept_page, how_to_fragment)
INTEGRATION_INFO = [
    ("chat", "Chat model", "chat_models", "chat-models"),
    ("llms", "LLM", "text_llms", "llms"),
    ("text_embedding", "Embedding model", "embedding_models", "embedding-models"),
    ("document_loaders", "Document loader", "document_loaders", "document-loaders"),
    ("vectorstores", "Vector store", "vectorstores", "vector-stores"),
    ("retrievers", "Retriever", "retrievers", "retrievers"),
    ("tools", "Tool", "tools", "tools"),
    # "stores" is a special case: there are no key-value store how-tos yet, so
    # its how_to_fragment is a placeholder for when they exist. For now its
    # related links section only contains the conceptual guide.
    ("stores", "Key-value store", "key_value_stores", "key-value-stores"),
]

# Maps integration_name -> [display_name, concept_page, how_to_fragment].
INTEGRATION_INFO_DICT = {
    integration_name: rest for integration_name, *rest in INTEGRATION_INFO
}

RELATED_LINKS_SECTION = """## Related

- {concept_display_name} [conceptual guide](/docs/concepts/{concept_page})
- {concept_display_name} [how-to guides](/docs/how_to/#{how_to_fragment})
"""

RELATED_LINKS_WITHOUT_HOW_TO_SECTION = """## Related

- {concept_display_name} [conceptual guide](/docs/concepts/{concept_page})
"""

# Matches ".../docs/integrations/<integration_type>/<page>.md" or ".mdx".
# The dot is escaped and the pattern is anchored at the end so that only pages
# sitting directly inside an integration-type directory match; "(?:^|/)" lets a
# relative path that starts with "docs/" match as well.
_INTEGRATION_DOC_PATTERN = re.compile(
    r"(?:^|/)docs/integrations/([^/]+)/([^/]+)\.mdx?$"
)


def _generate_related_links_section(
    integration_type: str, notebook_name: str
) -> Optional[str]:
    """Build the "Related" markdown section for one integration page.

    Args:
        integration_type: Name of the directory under ``integrations`` that
            holds the page (e.g. ``"chat"``).
        notebook_name: File stem of the page. Currently unused; kept so the
            call signature stays stable.

    Returns:
        The formatted section, or ``None`` when ``integration_type`` is not a
        key of ``INTEGRATION_INFO_DICT``.
    """
    if integration_type not in INTEGRATION_INFO_DICT:
        return None
    concept_display_name, concept_page, how_to_fragment = INTEGRATION_INFO_DICT[
        integration_type
    ]

    # Special case because there are no key-value store how-tos yet.
    if integration_type == "stores":
        return RELATED_LINKS_WITHOUT_HOW_TO_SECTION.format(
            concept_display_name=concept_display_name,
            concept_page=concept_page,
        )

    return RELATED_LINKS_SECTION.format(
        concept_display_name=concept_display_name,
        concept_page=concept_page,
        how_to_fragment=how_to_fragment,
    )


def _process_path(doc_path: Path) -> None:
    """Append the related links section to ``doc_path`` in place, if it applies.

    The file is left untouched when the path is not of the form
    ``docs/integrations/<type>/<name>.md[x]``, when ``<name>`` is ``index``,
    when ``<type>`` is unknown, or when the file already ends with the section.

    Args:
        doc_path: Path of a markdown file to examine.
    """
    # as_posix() keeps the "/"-based pattern working with Windows-style paths.
    match = _INTEGRATION_DOC_PATTERN.search(doc_path.as_posix())
    if match is None or match.group(2) == "index":
        return

    integration_type, notebook_name = match.groups()
    related_links_section = _generate_related_links_section(
        integration_type, notebook_name
    )
    if not related_links_section:
        return

    # Read only once we know the file qualifies. The encoding is explicit so
    # the result does not depend on the platform's locale default.
    content = doc_path.read_text(encoding="utf-8")
    if content.endswith(related_links_section):
        # Already processed by an earlier run; avoid appending a duplicate.
        return
    doc_path.write_text(content + "\n\n" + related_links_section, encoding="utf-8")


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Process every integration markdown file under the given docs directory.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. Must contain exactly one item, the output docs
            directory.

    Raises:
        SystemExit: If the number of arguments is not exactly one.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) != 1:
        raise SystemExit(f"usage: {Path(sys.argv[0]).name} <output_docs_dir>")

    output_docs_dir = Path(args[0])
    mds = output_docs_dir.rglob("integrations/**/*.md")
    mdxs = output_docs_dir.rglob("integrations/**/*.mdx")
    paths = itertools.chain(mds, mdxs)
    # Modify all md files in place, one file per task.
    with multiprocessing.Pool() as pool:
        pool.map(_process_path, paths)


if __name__ == "__main__":
    main()