[docs]: standardize doc loader doc strings (#25325)

2025-09-16 15:04:13 +00:00 · 2024-08-13 16:18:56 -07:00
parent e0bbb81d04
commit f4ffd692a3
8 changed files with 345 additions and 70 deletions
--- a/libs/community/langchain_community/document_loaders/firecrawl.py
+++ b/libs/community/langchain_community/document_loaders/firecrawl.py
@@ -6,11 +6,63 @@ from langchain_core.utils import get_from_env


 class FireCrawlLoader(BaseLoader):
-    """Load web pages as Documents using FireCrawl.
-
-    Must have Python package `firecrawl` installed and a FireCrawl API key. See
-        https://www.firecrawl.dev/ for more.
    """
+    FireCrawlLoader document loader integration
+
+    Setup:
+        Install ``firecrawl-py``, ``langchain_community`` and set the environment variable ``FIRECRAWL_API_KEY``.
+
+        .. code-block:: bash
+
+            pip install -U firecrawl-py langchain_community
+            export FIRECRAWL_API_KEY="your-api-key"
+
+    Instantiate:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import FireCrawlLoader
+
+            loader = FireCrawlLoader(
+                url = "https://firecrawl.dev",
+                mode = "crawl",
+                # other params = ...
+            )
+
+    Lazy load:
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant:
+            # docs_lazy = loader.alazy_load()  # async generator: iterate with "async for", do not await it
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
+             Join the waitlist to turn any web
+            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
+
+
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            Introducing [Smart Crawl!](https://www.firecrawl.dev/smart-crawl)
+             Join the waitlist to turn any web
+            {'ogUrl': 'https://www.firecrawl.dev/', 'title': 'Home - Firecrawl', 'robots': 'follow, index', 'ogImage': 'https://www.firecrawl.dev/og.png?123', 'ogTitle': 'Firecrawl', 'sitemap': {'lastmod': '2024-08-12T00:28:16.681Z', 'changefreq': 'weekly'}, 'keywords': 'Firecrawl,Markdown,Data,Mendable,Langchain', 'sourceURL': 'https://www.firecrawl.dev/', 'ogSiteName': 'Firecrawl', 'description': 'Firecrawl crawls and converts any website into clean markdown.', 'ogDescription': 'Turn any website into LLM-ready data.', 'pageStatusCode': 200, 'ogLocaleAlternate': []}
+
+    """  # noqa: E501

    def __init__(
        self,
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -161,10 +161,66 @@ class OnlinePDFLoader(BasePDFLoader):


 class PyPDFLoader(BasePDFLoader):
-    """Load PDF using pypdf into list of documents.
-
-    Loader chunks by page and stores page numbers in metadata.
    """
+    PyPDFLoader document loader integration
+
+    Setup:
+        Install ``langchain-community`` and ``pypdf``.
+
+        .. code-block:: bash
+
+            pip install -U langchain-community pypdf
+
+    Instantiate:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import PyPDFLoader
+
+            loader = PyPDFLoader(
+                file_path = "./example_data/layout-parser-paper.pdf",
+                password = "my-password",
+                extract_images = True,
+                # headers = None,
+                # extraction_mode = "plain",
+                # extraction_kwargs = None,
+            )
+
+    Lazy load:
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant:
+            # docs_lazy = loader.alazy_load()  # async generator: iterate with "async for", do not await it
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            LayoutParser : A Uniﬁed Toolkit for Deep
+            Learning Based Document Image Analysis
+            Zejiang Shen1( ), R
+            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}
+
+
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            LayoutParser : A Uniﬁed Toolkit for Deep
+            Learning Based Document Image Analysis
+            Zejiang Shen1( ), R
+            {'source': './example_data/layout-parser-paper.pdf', 'page': 0}
+    """  # noqa: E501

    def __init__(
        self,
--- a/libs/community/langchain_community/document_loaders/web_base.py
+++ b/libs/community/langchain_community/document_loaders/web_base.py
@@ -39,7 +39,75 @@ def _build_metadata(soup: Any, url: str) -> dict:


 class WebBaseLoader(BaseLoader):
-    """Load HTML pages using `urllib` and parse them with `BeautifulSoup'."""
+    """
+    WebBaseLoader document loader integration
+
+    Setup:
+        Install ``langchain_community``.
+
+        .. code-block:: bash
+
+            pip install -U langchain_community
+
+    Instantiate:
+        .. code-block:: python
+
+            from langchain_community.document_loaders import WebBaseLoader
+
+            loader = WebBaseLoader(
+                web_path = "https://www.espn.com/",
+                # header_template = None,
+                # verify_ssl = True,
+                # proxies = None,
+                # continue_on_failure = False,
+                # autoset_encoding = True,
+                # encoding = None,
+                # web_paths = (),
+                # requests_per_second = 2,
+                # default_parser = "html.parser",
+                # requests_kwargs = None,
+                # raise_for_status = False,
+                # bs_get_text_kwargs = None,
+                # bs_kwargs = None,
+                # session = None,
+                # show_progress = True,
+            )
+
+    Lazy load:
+        .. code-block:: python
+
+            docs = []
+            docs_lazy = loader.lazy_load()
+
+            # async variant:
+            # docs_lazy = await loader.alazy_load()
+
+            for doc in docs_lazy:
+                docs.append(doc)
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            ESPN - Serving Sports Fans. Anytime. Anywhere.
+
+            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
+
+
+    Async load:
+        .. code-block:: python
+
+            docs = await loader.aload()
+            print(docs[0].page_content[:100])
+            print(docs[0].metadata)
+
+        .. code-block:: python
+
+            ESPN - Serving Sports Fans. Anytime. Anywhere.
+
+            {'source': 'https://www.espn.com/', 'title': 'ESPN - Serving Sports Fans. Anytime. Anywhere.', 'description': 'Visit ESPN for live scores, highlights and sports news. Stream exclusive games on ESPN+ and play fantasy sports.', 'language': 'en'}
+
+    """  # noqa: E501

    def __init__(
        self,