From 75e966a2faf331ee61fb730ec76685f3ae81ffac Mon Sep 17 00:00:00 2001
From: Isaac Francisco <78627776+isahers1@users.noreply.github.com>
Date: Thu, 13 Jun 2024 19:28:57 -0700
Subject: [PATCH] docs, cli[patch]: document loaders doc template (#22862)

From: https://github.com/langchain-ai/langchain/pull/22290

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
---
 .../docs/document_loaders.ipynb               | 210 ++++++++++++++++++
 .../integration_template/document_loaders.py  |  71 ++++++
 .../langchain_cli/namespaces/integration.py   |   7 +-
 .../document_loaders/recursive_url_loader.py  |  38 +---
 4 files changed, 295 insertions(+), 31 deletions(-)
 create mode 100644 libs/cli/langchain_cli/integration_template/docs/document_loaders.ipynb
 create mode 100644 libs/cli/langchain_cli/integration_template/integration_template/document_loaders.py

diff --git a/libs/cli/langchain_cli/integration_template/docs/document_loaders.ipynb b/libs/cli/langchain_cli/integration_template/docs/document_loaders.ipynb
new file mode 100644
index 00000000000..1cc6d7d6a72
--- /dev/null
+++ b/libs/cli/langchain_cli/integration_template/docs/document_loaders.ipynb
@@ -0,0 +1,210 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "sidebar_label: __ModuleName__\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# __ModuleName__Loader\n",
+    "\n",
+    "- TODO: Make sure API reference link is correct.\n",
+    "\n",
+    "This notebook provides a quick overview for getting started with __ModuleName__ [document loader](/docs/integrations/document_loaders/). For detailed documentation of all __ModuleName__Loader features and configurations head to the [API reference](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.__module_name___loader.__ModuleName__Loader.html).\n",
+    "\n",
+    "- TODO: Add any other relevant links, like information about underlying API, etc.\n",
+    "\n",
+    "## Overview\n",
+    "### Integration details\n",
+    "\n",
+    "- TODO: Fill in table features.\n",
+    "- TODO: Remove JS support link if not relevant, otherwise ensure link is correct.\n",
+    "- TODO: Make sure API reference links are correct.\n",
+    "\n",
+    "| Class | Package | Local | Serializable | [JS support](https://js.langchain.com/v0.2/docs/integrations/document_loaders/web_loaders/__module_name___loader)|\n",
+    "| :--- | :--- | :---: | :---: |  :---: |\n",
+    "| [__ModuleName__Loader](https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.__module_name___loader.__ModuleName__Loader.html) | [langchain_community](https://api.python.langchain.com/en/latest/community_api_reference.html) | ✅/❌ | beta/❌ | ✅/❌ | \n",
+    "### Loader features\n",
+    "| Source | Document Lazy Loading | Async Support |\n",
+    "| :---: | :---: | :---: | \n",
+    "| __ModuleName__Loader | ✅/❌ | ✅/❌ | \n",
+    "\n",
+    "## Setup\n",
+    "\n",
+    "- TODO: Update with relevant info.\n",
+    "\n",
+    "To access the __ModuleName__ document loader you'll need to install the `__package_name__` integration package, create a __ModuleName__ account, and get an API key.\n",
+    "\n",
+    "### Credentials\n",
+    "\n",
+    "- TODO: Update with relevant info.\n",
+    "\n",
+    "Head to (TODO: link) to sign up for __ModuleName__ and generate an API key. Once you've done this, set the `__MODULE_NAME___API_KEY` environment variable:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import getpass\n",
+    "import os\n",
+    "\n",
+    "os.environ[\"__MODULE_NAME___API_KEY\"] = getpass.getpass(\"Enter your __ModuleName__ API key: \")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "If you want to get automated tracing of your model calls you can also set your [LangSmith](https://docs.smith.langchain.com/) API key by uncommenting below:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# os.environ[\"LANGSMITH_API_KEY\"] = getpass.getpass(\"Enter your LangSmith API key: \")\n",
+    "# os.environ[\"LANGSMITH_TRACING\"] = \"true\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Installation\n",
+    "\n",
+    "Install **langchain_community**.\n",
+    "\n",
+    "- TODO: Add any other required packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install -qU langchain_community"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Instantiation\n",
+    "\n",
+    "Now we can instantiate our document loader object and load documents:\n",
+    "\n",
+    "- TODO: Update loader instantiation with relevant params."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders import __ModuleName__Loader\n",
+    "\n",
+    "loader = __ModuleName__Loader(\n",
+    "    # required params = ...\n",
+    "    # optional params = ...\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load\n",
+    "\n",
+    "- TODO: Run cells to show loading capabilities"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()\n",
+    "docs[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(docs[0].metadata)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Lazy Load\n",
+    "\n",
+    "- TODO: Run cells to show lazy loading capabilities. Delete if lazy loading is not implemented."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "page = []\n",
+    "for doc in loader.lazy_load():\n",
+    "    page.append(doc)\n",
+    "    if len(page) >= 10:\n",
+    "        # do some paged operation, e.g.\n",
+    "        # index.upsert(page)\n",
+    "\n",
+    "        page = []"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## TODO: Any functionality specific to this document loader\n",
+    "\n",
+    "E.g. using specific configs for different loading behavior. Delete if not relevant."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## API reference\n",
+    "\n",
+    "For detailed documentation of all __ModuleName__Loader features and configurations head to the API reference: https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.__module_name___loader.__ModuleName__Loader.html"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/libs/cli/langchain_cli/integration_template/integration_template/document_loaders.py b/libs/cli/langchain_cli/integration_template/integration_template/document_loaders.py
new file mode 100644
index 00000000000..ecf044f71e2
--- /dev/null
+++ b/libs/cli/langchain_cli/integration_template/integration_template/document_loaders.py
@@ -0,0 +1,71 @@
+"""__ModuleName__ document loader."""
+
+from typing import Iterator
+from langchain_core.document_loaders.base import BaseLoader
+from langchain_core.documents import Document
+
+
+class __ModuleName__Loader(BaseLoader):
+    # TODO: Replace all TODOs in docstring. See example docstring:
+    # https://github.com/langchain-ai/langchain/blob/869523ad728e6b76d77f170cce13925b4ebc3c1e/libs/community/langchain_community/document_loaders/recursive_url_loader.py#L54
+    """
+    __ModuleName__ document loader integration
+    
+    # TODO: Replace with relevant packages, env vars.
+    Setup:
+        Install ``__package_name__`` and set environment variable ``__MODULE_NAME___API_KEY``.
+
+        .. code-block:: bash
+
+            pip install -U __package_name__
+            export __MODULE_NAME___API_KEY="your-api-key"
+
+    # TODO: Replace with relevant init params.
+    Instantiate:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import __ModuleName__Loader
+
+            loader = __ModuleName__Loader(
+                # required params = ...
+                # other params = ...
+            )
+
+    Lazy load:
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant (iterate with "async for doc in docs_lazy"):
+            # docs_lazy = loader.alazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            TODO: Example output  
+
+    # TODO: Delete if async load is not implemented
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            TODO: Example output
+    """
+
+    # TODO: This method must be implemented to load documents.
+    # Do not implement load(), a default implementation is already available.
+    def lazy_load(self) -> Iterator[Document]:
+        raise NotImplementedError()
+    
+    # TODO: Implement if you would like to change default BaseLoader implementation
+    # async def alazy_load(self) -> AsyncIterator[Document]:
\ No newline at end of file
diff --git a/libs/cli/langchain_cli/namespaces/integration.py b/libs/cli/langchain_cli/namespaces/integration.py
index b19ac7ce2a5..1d9280cd27c 100644
--- a/libs/cli/langchain_cli/namespaces/integration.py
+++ b/libs/cli/langchain_cli/namespaces/integration.py
@@ -153,7 +153,7 @@ def create_doc(
     component_type: Annotated[
         str,
         typer.Option(
-            help=("The type of component. Currently only 'ChatModel' supported."),
+            help=("The type of component. Currently only 'ChatModel', 'DocumentLoader' supported."),
         ),
     ] = "ChatModel",
     destination_dir: Annotated[
@@ -196,7 +196,10 @@ def create_doc(
     )
 
     # copy over template from ../integration_template
-    docs_template = Path(__file__).parents[1] / "integration_template/docs/chat.ipynb"
+    if component_type == "ChatModel":
+        docs_template = Path(__file__).parents[1] / "integration_template/docs/chat.ipynb"
+    elif component_type == "DocumentLoader":
+        docs_template = Path(__file__).parents[1] / "integration_template/docs/document_loaders.ipynb"
     shutil.copy(docs_template, destination_path)
 
     # replacements in file
diff --git a/libs/community/langchain_community/document_loaders/recursive_url_loader.py b/libs/community/langchain_community/document_loaders/recursive_url_loader.py
index 62e7352e3d5..b5d5047cd4f 100644
--- a/libs/community/langchain_community/document_loaders/recursive_url_loader.py
+++ b/libs/community/langchain_community/document_loaders/recursive_url_loader.py
@@ -110,14 +110,17 @@ class RecursiveUrlLoader(BaseLoader):
                 # ...
             )
 
-    Load:
-        Use ``.load()`` to synchronously load into memory all Documents, with one
-        Document per visited URL. Starting from the initial URL, we recurse through
-        all linked URLs up to the specified max_depth.
-
+    Lazy load:
         .. code-block:: python
 
-            docs = loader.load()
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant (iterate with "async for doc in docs_lazy"):
+            # docs_lazy = loader.alazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
             print(docs[0].page_content[:100])
             print(docs[0].metadata)
 
@@ -146,29 +149,6 @@ class RecursiveUrlLoader(BaseLoader):
                 <meta charset="utf-8" /><
             {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None}
 
-    Lazy load:
-        .. code-block:: python
-
-            docs = []
-            docs_lazy = loader.lazy_load()
-
-            # async variant:
-            # docs_lazy = await loader.alazy_load()
-
-            for doc in docs_lazy:
-                docs.append(doc)
-            print(docs[0].page_content[:100])
-            print(docs[0].metadata)
-
-        .. code-block:: python
-
-            <!DOCTYPE html>
-
-            <html xmlns="http://www.w3.org/1999/xhtml">
-            <head>
-                <meta charset="utf-8" /><
-            {'source': 'https://docs.python.org/3.9/', 'content_type': 'text/html', 'title': '3.9.19 Documentation', 'language': None}
-
     Content parsing / extraction:
         By default the loader sets the raw HTML from each link as the Document page
         content. To parse this HTML into a more human/LLM-friendly format you can pass