[docs]: standardize doc loader doc strings (#25325)

2025-09-26 13:59:49 +00:00 · 2024-08-13 16:18:56 -07:00
parent e0bbb81d04
commit f4ffd692a3
8 changed files with 345 additions and 70 deletions
--- a/docs/docs/integrations/document_loaders/firecrawl.ipynb
+++ b/docs/docs/integrations/document_loaders/firecrawl.ipynb
--- a/docs/docs/integrations/document_loaders/pypdfloader.ipynb
+++ b/docs/docs/integrations/document_loaders/pypdfloader.ipynb
@@ -122,7 +122,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
@@ -131,21 +131,41 @@
       "6"
      ]
     },
-     "execution_count": 5,
+     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "page = []\n",
+    "pages = []\n",
    "for doc in loader.lazy_load():\n",
-    "    page.append(doc)\n",
-    "    if len(page) >= 10:\n",
+    "    pages.append(doc)\n",
+    "    if len(pages) >= 10:\n",
    "        # do some paged operation, e.g.\n",
    "        # index.upsert(page)\n",
    "\n",
-    "        page = []\n",
-    "len(page)"
+    "        pages = []\n",
+    "len(pages)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LayoutParser : A Uniﬁed Toolkit for DL-Based DIA 11\n",
+      "focuses on precision, eﬃciency, and robustness. \n",
+      "{'source': './example_data/layout-parser-paper.pdf', 'page': 10}\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(pages[0].page_content[:100])\n",
+    "print(pages[0].metadata)"
   ]
  },
  {
--- a/docs/docs/integrations/document_loaders/recursive_url.ipynb
+++ b/docs/docs/integrations/document_loaders/recursive_url.ipynb
@@ -229,14 +229,14 @@
    }
   ],
   "source": [
-    "page = []\n",
+    "pages = []\n",
    "for doc in loader.lazy_load():\n",
-    "    page.append(doc)\n",
-    "    if len(page) >= 10:\n",
+    "    pages.append(doc)\n",
+    "    if len(pages) >= 10:\n",
    "        # do some paged operation, e.g.\n",
    "        # index.upsert(page)\n",
    "\n",
-    "        page = []"
+    "        pages = []"
   ]
  },
  {
--- a/docs/docs/integrations/document_loaders/web_base.ipynb
+++ b/docs/docs/integrations/document_loaders/web_base.ipynb
--- a/libs/community/langchain_community/document_loaders/firecrawl.py
+++ b/libs/community/langchain_community/document_loaders/firecrawl.py
@@ -6,11 +6,63 @@ from langchain_core.utils import get_from_env


 class FireCrawlLoader(BaseLoader):
-    """Load web pages as Documents using FireCrawl.
-
-    Must have Python package `firecrawl` installed and a FireCrawl API key. See
-        https://www.firecrawl.dev/ for more.
    """
+    FireCrawlLoader document loader integration
+
+    Setup:
+        Install ``firecrawl-py``, ``langchain_community`` and set environment variable ``FIRECRAWL_API_KEY``.
+
+        .. code-block:: bash
+
+            pip install -U firecrawl-py langchain_community
+            export FIRECRAWL_API_KEY="your-api-key"
+
+    Instantiate:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import FireCrawlLoader
+
+            loader = FireCrawlLoader(
+                url = "https://firecrawl.dev",
+                mode = "crawl"
+                # other params = ...
+            )
+
+    Lazy load:
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant (an async iterator; consume with ``async for doc in docs_lazy``):
+            # docs_lazy = loader.alazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
+             Join the waitlist to turn any web
+            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
+
+
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
+             Join the waitlist to turn any web
+            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
+
+    """  # noqa: E501

    def __init__(
        self,
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -161,10 +161,66 @@ class OnlinePDFLoader(BasePDFLoader):


 class PyPDFLoader(BasePDFLoader):
-    """Load PDF using pypdf into list of documents.
-
-    Loader chunks by page and stores page numbers in metadata.
    """
+    PyPDFLoader document loader integration
+
+    Setup:
+        Install ``langchain-community``.
+
+        .. code-block:: bash
+
+            pip install -U langchain-community
+
+    Instantiate:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import PyPDFLoader
+
+            loader = PyPDFLoader(
+                file_path = "./example_data/layout-parser-paper.pdf",
+                password = "my-password",
+                extract_images = True,
+                # headers = None
+                # extraction_mode = "plain",
+                # extraction_kwargs = None,
+            )
+
+    Lazy load:
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant (an async iterator; consume with ``async for doc in docs_lazy``):
+            # docs_lazy = loader.alazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            LayoutParser : A Uniﬁed Toolkit for Deep
+            Learning Based Document Image Analysis
+            Zejiang Shen1( ), R
+            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}
+
+
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            LayoutParser : A Uniﬁed Toolkit for Deep
+            Learning Based Document Image Analysis
+            Zejiang Shen1( ), R
+            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}
+    """  # noqa: E501

    def __init__(
        self,
--- a/libs/community/langchain_community/document_loaders/web_base.py
+++ b/libs/community/langchain_community/document_loaders/web_base.py
@@ -39,7 +39,75 @@ def _build_metadata(soup: Any, url: str) -> dict:


 class WebBaseLoader(BaseLoader):
-    """Load HTML pages using `urllib` and parse them with `BeautifulSoup'."""
+    """
+    WebBaseLoader document loader integration
+
+    Setup:
+        Install ``langchain_community``.
+
+        .. code-block:: bash
+
+            pip install -U langchain_community
+
+    Instantiate:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import WebBaseLoader
+
+            loader = WebBaseLoader(
+                web_path = "https://www.espn.com/"
+                # header_template = None,
+                # verify_ssl = True,
+                # proxies = None,
+                # continue_on_failure = False,
+                # autoset_encoding = True,
+                # encoding = None,
+                # web_paths = (),
+                # requests_per_second = 2,
+                # default_parser = "html.parser",
+                # requests_kwargs = None,
+                # raise_for_status = False,
+                # bs_get_text_kwargs = None,
+                # bs_kwargs = None,
+                # session = None,
+                # show_progress = True,
+            )
+
+    Lazy load:
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant (an async iterator; consume with ``async for doc in docs_lazy``):
+            # docs_lazy = loader.alazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            ESPN - Serving Sports Fans. Anytime. Anywhere.
+
+            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
+
+
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            ESPN - Serving Sports Fans. Anytime. Anywhere.
+
+            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
+
+    """  # noqa: E501

    def __init__(
        self,
--- a/libs/partners/unstructured/langchain_unstructured/document_loaders.py
+++ b/libs/partners/unstructured/langchain_unstructured/document_loaders.py
@@ -24,29 +24,9 @@ _DEFAULT_URL = "https://api.unstructuredapp.io/general/v0/general"
 class UnstructuredLoader(BaseLoader):
    """Unstructured document loader interface.

-    Partition and load files using either the `unstructured-client` sdk and the
-    Unstructured API or locally using the `unstructured` library.
-
-    API:
-    This package is configured to work with the Unstructured API by default.
-    To use the Unstructured API, set
-    `partition_via_api=True` and define `api_key`. If you are running the unstructured
-    API locally, you can change the API rule by defining `url` when you initialize the
-    loader. The hosted Unstructured API requires an API key. See the links below to
-    learn more about our API offerings and get an API key.
-
-    Local:
-    To partition files locally, you must have the `unstructured` package installed.
-    You can install it with `pip install unstructured`.
-    By default the file loader uses the Unstructured `partition` function and will
-    automatically detect the file type.
-
-    In addition to document specific partition parameters, Unstructured has a rich set
-    of "chunking" parameters for post-processing elements into more useful text segments
-    for uses cases such as Retrieval Augmented Generation (RAG). You can pass additional
-    Unstructured kwargs to the loader to configure different unstructured settings.
-
    Setup:
+        Install ``langchain-unstructured`` and set environment variable ``UNSTRUCTURED_API_KEY``.
+
        .. code-block:: bash
            pip install -U langchain-unstructured
            export UNSTRUCTURED_API_KEY="your-api-key"
@@ -63,20 +43,46 @@ class UnstructuredLoader(BaseLoader):
                strategy="fast",
            )

-    Load:
+    Lazy load:
        .. code-block:: python
-            docs = loader.load()

+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant (an async iterator; consume with ``async for doc in docs_lazy``):
+            # docs_lazy = loader.alazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
            print(docs[0].page_content[:100])
            print(docs[0].metadata)

+        .. code-block:: python
+
+            1 2 0 2
+            {'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}
+
+
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            1 2 0 2
+            {'source': './example_data/layout-parser-paper.pdf', 'coordinates': {'points': ((16.34, 213.36), (16.34, 253.36), (36.34, 253.36), (36.34, 213.36)), 'system': 'PixelSpace', 'layout_width': 612, 'layout_height': 792}, 'file_directory': './example_data', 'filename': 'layout-parser-paper.pdf', 'languages': ['eng'], 'last_modified': '2024-07-25T21:28:58', 'page_number': 1, 'filetype': 'application/pdf', 'category': 'UncategorizedText', 'element_id': 'd3ce55f220dfb75891b4394a18bcb973'}
+
+
    References
    ----------
    https://docs.unstructured.io/api-reference/api-services/sdk
    https://docs.unstructured.io/api-reference/api-services/overview
    https://docs.unstructured.io/open-source/core-functionality/partitioning
    https://docs.unstructured.io/open-source/core-functionality/chunking
-    """
+    """  # noqa: E501

    def __init__(
        self,