From 75e966a2faf331ee61fb730ec76685f3ae81ffac Mon Sep 17 00:00:00 2001 From: Isaac Francisco <78627776+isahers1@users.noreply.github.com> Date: Thu, 13 Jun 2024 19:28:57 -0700 Subject: [PATCH] docs, cli[patch]: document loaders doc template (#22862) From: https://github.com/langchain-ai/langchain/pull/22290 --------- Co-authored-by: Eugene Yurtsev --- .../docs/document_loaders.ipynb | 210 ++++++++++++++++++ .../integration_template/document_loaders.py | 71 ++++++ .../langchain_cli/namespaces/integration.py | 7 +- .../document_loaders/recursive_url_loader.py | 38 +--- 4 files changed, 295 insertions(+), 31 deletions(-) create mode 100644 libs/cli/langchain_cli/integration_template/docs/document_loaders.ipynb create mode 100644 libs/cli/langchain_cli/integration_template/integration_template/document_loaders.py diff --git a/libs/cli/langchain_cli/integration_template/docs/document_loaders.ipynb b/libs/cli/langchain_cli/integration_template/docs/document_loaders.ipynb new file mode 100644 index 00000000000..1cc6d7d6a72 --- /dev/null +++ b/libs/cli/langchain_cli/integration_template/docs/document_loaders.ipynb @@ -0,0 +1,210 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: __ModuleName__\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# __ModuleName__Loader\n", + "\n", + "- TODO: Make sure API reference link is correct.\n", + "\n", + "This notebook provides a quick overview for getting started with __ModuleName__ [document loader](/docs/integrations/document_loaders/). 
For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.__module_name___loader.__ModuleName__Loader.html).\n", + "\n", + "- TODO: Add any other relevant links, like information about underlying API, etc.\n", + "\n", + "## Overview\n", + "### Integration details\n", + "\n", + "- TODO: Fill in table features.\n", + "- TODO: Remove JS support link if not relevant, otherwise ensure link is correct.\n", + "- TODO: Make sure API reference links are correct.\n", + "\n", + "| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/v0.2/docs/integrations/document_loaders/web_loaders/__module_name___loader)|\n", + "| :--- | :--- | :---: | :---: | :---: |\n", + "| [__ModuleName__Loader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.__module_name___loader.__ModuleName__Loader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅/❌ | beta/❌ | ✅/❌ | \n", + "### Loader features\n", + "| Source | Document Lazy Loading | Async Support |\n", + "| :---: | :---: | :---: | \n", + "| __ModuleName__Loader | ✅/❌ | ✅/❌ | \n", + "\n", + "## Setup\n", + "\n", + "- TODO: Update with relevant info.\n", + "\n", + "To access the __ModuleName__ document loader you'll need to install the `__package_name__` integration package, and create a __ModuleName__ account and get an API key.\n", + "\n", + "### Credentials\n", + "\n", + "- TODO: Update with relevant info.\n", + "\n", + "Head to (TODO: link) to sign up to __ModuleName__ and generate an API key. 
Once you've done this, set the __MODULE_NAME___API_KEY environment variable:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import getpass\n", + "import os\n", + "\n", + "os.environ[\"__MODULE_NAME___API_KEY\"] = getpass.getpass(\"Enter your __ModuleName__ API key: \")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n", + "# os.environ[\"LANGSMITH_TRACING\"] = \"true\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Installation\n", + "\n", + "Install **langchain_community**.\n", + "\n", + "- TODO: Add any other required packages" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install -qU langchain_community" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Instantiation\n", + "\n", + "Now we can instantiate our document loader object and load documents:\n", + "\n", + "- TODO: Update loader instantiation with relevant params." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import __ModuleName__Loader\n", + "\n", + "loader = __ModuleName__Loader(\n", + " # required params = ...\n", + " # optional params = ...\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load\n", + "\n", + "- TODO: Run cells to show loading capabilities" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()\n", + "docs[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(docs[0].metadata)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Lazy Load\n", + "\n", + "- TODO: Run cells to show lazy loading capabilities. Delete if lazy loading is not implemented." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "page = []\n", + "for doc in loader.lazy_load():\n", + " page.append(doc)\n", + " if len(page) >= 10:\n", + " # do some paged operation, e.g.\n", + " # index.upsert(page)\n", + "\n", + " page = []" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## TODO: Any functionality specific to this document loader\n", + "\n", + "E.g. using specific configs for different loading behavior. Delete if not relevant." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## API reference\n", + "\n", + "For detailed documentation of all __ModuleName__Loader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.__module_name___loader.__ModuleName__Loader.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/cli/langchain_cli/integration_template/integration_template/document_loaders.py b/libs/cli/langchain_cli/integration_template/integration_template/document_loaders.py new file mode 100644 index 00000000000..ecf044f71e2 --- /dev/null +++ b/libs/cli/langchain_cli/integration_template/integration_template/document_loaders.py @@ -0,0 +1,71 @@ +"""__ModuleName__ document loader.""" + +from typing import Iterator +from langchain_core.document_loaders.base import BaseLoader +from langchain_core.documents import Document + + +class __ModuleName__Loader(BaseLoader): + # TODO: Replace all TODOs in docstring. See example docstring: + # https://github.com/langchain-ai/langchain/blob/869523ad728e6b76d77f170cce13925b4ebc3c1e/libs/community/langchain_community/document_loaders/recursive_url_loader.py#L54 + """ + __ModuleName__ document loader integration + + # TODO: Replace with relevant packages, env vars. + Setup: + Install ``__package_name__`` and set environment variable ``__MODULE_NAME___API_KEY``. + + .. code-block:: bash + + pip install -U __package_name__ + export __MODULE_NAME___API_KEY="your-api-key" + + # TODO: Replace with relevant init params. + Instantiate: + .. code-block:: python + + from langchain_community.document_loaders import __ModuleName__Loader + + loader = __ModuleName__Loader( + # required params = ... + # other params = ... + ) + + Lazy load: + .. 
code-block:: python + + docs = [] + docs_lazy = loader.lazy_load() + + # async variant: + # docs_lazy = await loader.alazy_load() + + for doc in docs_lazy: + docs.append(doc) + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + TODO: Example output + + # TODO: Delete if async load is not implemented + Async load: + .. code-block:: python + + docs = await loader.aload() + print(docs[0].page_content[:100]) + print(docs[0].metadata) + + .. code-block:: python + + TODO: Example output + """ + + # TODO: This method must be implemented to load documents. + # Do not implement load(), a default implementation is already available. + def lazy_load(self) -> Iterator[Document]: + raise NotImplementedError() + + # TODO: Implement if you would like to change default BaseLoader implementation + # async def alazy_load(self) -> AsyncIterator[Document]: \ No newline at end of file diff --git a/libs/cli/langchain_cli/namespaces/integration.py b/libs/cli/langchain_cli/namespaces/integration.py index b19ac7ce2a5..1d9280cd27c 100644 --- a/libs/cli/langchain_cli/namespaces/integration.py +++ b/libs/cli/langchain_cli/namespaces/integration.py @@ -153,7 +153,7 @@ def create_doc( component_type: Annotated[ str, typer.Option( - help=("The type of component. Currently only 'ChatModel' supported."), + help=("The type of component. 
Currently only 'ChatModel', 'DocumentLoader' supported."), ), ] = "ChatModel", destination_dir: Annotated[ @@ -196,7 +196,10 @@ def create_doc( ) # copy over template from ../integration_template - docs_template = Path(__file__).parents[1] / "integration_template/docs/chat.ipynb" + if component_type == "ChatModel": + docs_template = Path(__file__).parents[1] / "integration_template/docs/chat.ipynb" + elif component_type == "DocumentLoader": + docs_template = Path(__file__).parents[1] / "integration_template/docs/document_loaders.ipynb" shutil.copy(docs_template, destination_path) # replacements in file diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py index 62e7352e3d5..b5d5047cd4f 100644 --- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py +++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py @@ -110,14 +110,17 @@ class RecursiveUrlLoader(BaseLoader): # ... ) - Load: - Use ``.load()`` to synchronously load into memory all Documents, with one - Document per visited URL. Starting from the initial URL, we recurse through - all linked URLs up to the specified max_depth. - + Lazy load: .. code-block:: python - docs = loader.load() + docs = [] + docs_lazy = loader.lazy_load() + + # async variant: + # docs_lazy = await loader.alazy_load() + + for doc in docs_lazy: + docs.append(doc) print(docs[0].page_content[:100]) print(docs[0].metadata) @@ -146,29 +149,6 @@ class RecursiveUrlLoader(BaseLoader): < {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None} - Lazy load: - .. code-block:: python - - docs = [] - docs_lazy = loader.lazy_load() - - # async variant: - # docs_lazy = await loader.alazy_load() - - for doc in docs_lazy: - docs.append(doc) - print(docs[0].page_content[:100]) - print(docs[0].metadata) - - .. 
code-block:: python - - - - - - < - {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None} - Content parsing / extraction: By default the loader sets the raw HTML from each link as the Document page content. To parse this HTML into a more human/LLM-friendly format you can pass