docstrings document_loaders 3 (#6937)

- Updated docstrings for `document_loaders` - Mass update `"""Loader that loads` to `"""Loads` @baskaryan - please, review
2025-07-04 12:18:24 +00:00 · 2023-07-10 08:56:53 -07:00 · 2023-07-10 08:56:53 -07:00 · 5eec74d9a5
commit 5eec74d9a5
parent 9d13dcd17c
54 changed files with 316 additions and 105 deletions
--- a/langchain/document_loaders/acreom.py
+++ b/langchain/document_loaders/acreom.py
@ -1,4 +1,4 @@
-"""Loader that loads acreom vault from a directory."""
+"""Loads acreom vault from a directory."""
 import re
 from pathlib import Path
 from typing import Iterator, List
--- a/langchain/document_loaders/airbyte_json.py
+++ b/langchain/document_loaders/airbyte_json.py
@ -1,4 +1,4 @@
-"""Loader that loads local airbyte json files."""
+"""Loads local airbyte json files."""
 import json
 from typing import List
@ -8,7 +8,7 @@ from langchain.utils import stringify_dict
 class AirbyteJSONLoader(BaseLoader):
-    """Loader that loads local airbyte json files."""
+    """Loads local airbyte json files."""
    def __init__(self, file_path: str):
        """Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
--- a/langchain/document_loaders/azlyrics.py
+++ b/langchain/document_loaders/azlyrics.py
@ -1,4 +1,4 @@
-"""Loader that loads AZLyrics."""
+"""Loads AZLyrics."""
 from typing import List
 from langchain.docstore.document import Document
@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
 class AZLyricsLoader(WebBaseLoader):
-    """Loader that loads AZLyrics webpages."""
+    """Loads AZLyrics webpages."""
    def load(self) -> List[Document]:
        """Load webpages into Documents."""
--- a/langchain/document_loaders/bilibili.py
+++ b/langchain/document_loaders/bilibili.py
@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
 class BiliBiliLoader(BaseLoader):
-    """Loader that loads bilibili transcripts."""
+    """Loads bilibili transcripts."""
    def __init__(self, video_urls: List[str]):
        """Initialize with bilibili url.
--- a/langchain/document_loaders/blackboard.py
+++ b/langchain/document_loaders/blackboard.py
@ -1,4 +1,4 @@
-"""Loader that loads all documents from a blackboard course."""
+"""Loads all documents from a blackboard course."""
 import contextlib
 import re
 from pathlib import Path
--- a/langchain/document_loaders/chatgpt.py
+++ b/langchain/document_loaders/chatgpt.py
@ -1,3 +1,4 @@
 """Load conversations from ChatGPT data export"""
 import datetime
 import json
 from typing import List
@ -31,7 +32,7 @@ class ChatGPTLoader(BaseLoader):
    """Load conversations from exported ChatGPT data."""
    def __init__(self, log_file: str, num_logs: int = -1):
-        """
+        """Initialize a class object.
        Args:
            log_file: Path to the log file
--- a/langchain/document_loaders/college_confidential.py
+++ b/langchain/document_loaders/college_confidential.py
@ -1,4 +1,4 @@
-"""Loader that loads College Confidential."""
+"""Loads College Confidential."""
 from typing import List
 from langchain.docstore.document import Document
@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
 class CollegeConfidentialLoader(WebBaseLoader):
-    """Loader that loads College Confidential webpages."""
+    """Loads College Confidential webpages."""
    def load(self) -> List[Document]:
        """Load webpages as Documents."""
--- a/langchain/document_loaders/epub.py
+++ b/langchain/document_loaders/epub.py
@ -1,4 +1,4 @@
-"""Loader that loads EPub files."""
+"""Loads EPub files."""
 from typing import List
 from langchain.document_loaders.unstructured import (
--- a/langchain/document_loaders/excel.py
+++ b/langchain/document_loaders/excel.py
@ -1,4 +1,4 @@
-"""Loader that loads Microsoft Excel files."""
+"""Loads Microsoft Excel files."""
 from typing import Any, List
 from langchain.document_loaders.unstructured import (
--- a/langchain/document_loaders/facebook_chat.py
+++ b/langchain/document_loaders/facebook_chat.py
@ -1,4 +1,4 @@
-"""Loader that loads Facebook chat json dump."""
+"""Loads Facebook chat json dump."""
 import datetime
 import json
 from pathlib import Path
--- a/langchain/document_loaders/figma.py
+++ b/langchain/document_loaders/figma.py
@ -1,4 +1,4 @@
-"""Loader that loads Figma files json dump."""
+"""Loads Figma files json dump."""
 import json
 import urllib.request
 from typing import Any, List
--- a/langchain/document_loaders/gitbook.py
+++ b/langchain/document_loaders/gitbook.py
@ -1,4 +1,4 @@
-"""Loader that loads GitBook."""
+"""Loads GitBook."""
 from typing import Any, List, Optional
 from urllib.parse import urljoin, urlparse
--- a/langchain/document_loaders/googledrive.py
+++ b/langchain/document_loaders/googledrive.py
@ -1,4 +1,4 @@
-"""Loader that loads data from Google Drive."""
+"""Loads data from Google Drive."""
 # Prerequisites:
 # 1. Create a Google Cloud project
--- a/langchain/document_loaders/hn.py
+++ b/langchain/document_loaders/hn.py
@ -1,4 +1,4 @@
-"""Loader that loads Hacker News."""
+"""Loads HN."""
 from typing import Any, List
 from langchain.docstore.document import Document
--- a/langchain/document_loaders/ifixit.py
+++ b/langchain/document_loaders/ifixit.py
@ -1,4 +1,4 @@
-"""Loader that loads iFixit data."""
+"""Loads iFixit data."""
 from typing import List, Optional
 import requests
--- a/langchain/document_loaders/image.py
+++ b/langchain/document_loaders/image.py
@ -1,4 +1,4 @@
-"""Loader that loads image files."""
+"""Loads image files."""
 from typing import List
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
--- a/langchain/document_loaders/mastodon.py
+++ b/langchain/document_loaders/mastodon.py
@ -37,13 +37,13 @@ class MastodonTootsLoader(BaseLoader):
        Args:
            mastodon_accounts: The list of Mastodon accounts to query.
-            number_toots: How many toots to pull for each account. Default is 100.
+            number_toots: How many toots to pull for each account. Defaults to 100.
            exclude_replies: Whether to exclude reply toots from the load.
-                Default is False.
+                Defaults to False.
            access_token: An access token if toots are loaded as a Mastodon app. Can
                also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
            api_base_url: A Mastodon API base URL to talk to, if not using the default.
-                Default is "https://mastodon.social".
+                Defaults to "https://mastodon.social".
        """
        mastodon = _dependable_mastodon_import()
        access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")
--- a/langchain/document_loaders/mhtml.py
+++ b/langchain/document_loaders/mhtml.py
@ -24,10 +24,11 @@ class MHTMLLoader(BaseLoader):
        to pass to the BeautifulSoup object.
        Args:
-            file_path: The path to the file to load.
+            file_path: Path to file to load.
            open_encoding: The encoding to use when opening the file.
-            bs_kwargs: soup kwargs to pass to the BeautifulSoup object.
+            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
-            get_text_separator: The separator to use when getting text from the soup.
+            get_text_separator: The separator to use when getting the text
                from the soup.
        """
        try:
            import bs4  # noqa:F401
--- a/langchain/document_loaders/modern_treasury.py
+++ b/langchain/document_loaders/modern_treasury.py
@ -35,6 +35,16 @@ class ModernTreasuryLoader(BaseLoader):
        organization_id: Optional[str] = None,
        api_key: Optional[str] = None,
    ) -> None:
        """
        Args:
            resource: The Modern Treasury resource to load.
            organization_id: The Modern Treasury organization ID. It can also be
               specified via the environment variable
               "MODERN_TREASURY_ORGANIZATION_ID".
            api_key: The Modern Treasury API key. It can also be specified via
               the environment variable "MODERN_TREASURY_API_KEY".
        """
        self.resource = resource
        organization_id = organization_id or get_from_env(
            "organization_id", "MODERN_TREASURY_ORGANIZATION_ID"
--- a/langchain/document_loaders/notebook.py
+++ b/langchain/document_loaders/notebook.py
@ -1,4 +1,4 @@
-"""Loader that loads .ipynb notebook files."""
+"""Loads .ipynb notebook files."""
 import json
 from pathlib import Path
 from typing import Any, List
@ -10,7 +10,18 @@ from langchain.document_loaders.base import BaseLoader
 def concatenate_cells(
    cell: dict, include_outputs: bool, max_output_length: int, traceback: bool
 ) -> str:
-    """Combine cells information in a readable format ready to be used."""
+    """Combine cells information in a readable format ready to be used.
    Args:
        cell: A dictionary
        include_outputs: Whether to include the outputs of the cell.
        max_output_length: Maximum length of the output to be displayed.
        traceback: Whether to return a traceback of the error.
    Returns:
        A string with the cell information.
    """
    cell_type = cell["cell_type"]
    source = cell["source"]
    output = cell["outputs"]
@ -45,7 +56,7 @@ def concatenate_cells(
 def remove_newlines(x: Any) -> Any:
-    """Remove recursively newlines, no matter the data structure they are stored in."""
+    """Recursively removes newlines, no matter the data structure they are stored in."""
    import pandas as pd
    if isinstance(x, str):
@ -59,7 +70,7 @@ def remove_newlines(x: Any) -> Any:
 class NotebookLoader(BaseLoader):
-    """Loader that loads .ipynb notebook files."""
+    """Loads .ipynb notebook files."""
    def __init__(
        self,
@ -69,7 +80,19 @@ class NotebookLoader(BaseLoader):
        remove_newline: bool = False,
        traceback: bool = False,
    ):
-        """Initialize with path."""
+        """Initialize with path.
        Args:
            path: The path to load the notebook from.
            include_outputs: Whether to include the outputs of the cell.
                Defaults to False.
            max_output_length: Maximum length of the output to be displayed.
                Defaults to 10.
            remove_newline: Whether to remove newlines from the notebook.
                Defaults to False.
            traceback: Whether to return a traceback of the error.
                Defaults to False.
        """
        self.file_path = path
        self.include_outputs = include_outputs
        self.max_output_length = max_output_length
--- a/langchain/document_loaders/notion.py
+++ b/langchain/document_loaders/notion.py
@ -1,4 +1,4 @@
-"""Loader that loads Notion directory dump."""
+"""Loads Notion directory dump."""
 from pathlib import Path
 from typing import List
@ -7,10 +7,10 @@ from langchain.document_loaders.base import BaseLoader
 class NotionDirectoryLoader(BaseLoader):
-    """Loader that loads Notion directory dump."""
+    """Loads Notion directory dump."""
    def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a file path."""
        self.file_path = path
    def load(self) -> List[Document]:
--- a/langchain/document_loaders/notiondb.py
+++ b/langchain/document_loaders/notiondb.py
@ -15,11 +15,12 @@ BLOCK_URL = NOTION_BASE_URL + "/blocks/{block_id}/children"
 class NotionDBLoader(BaseLoader):
    """Notion DB Loader.
-    Reads content from pages within a Noton Database.
+    Reads content from pages within a Notion Database.
    Args:
        integration_token (str): Notion integration token.
        database_id (str): Notion database id.
        request_timeout_sec (int): Timeout for Notion requests in seconds.
            Defaults to 10.
    """
    def __init__(
@ -75,7 +76,11 @@ class NotionDBLoader(BaseLoader):
        return pages
    def load_page(self, page_summary: Dict[str, Any]) -> Document:
-        """Read a page."""
+        """Read a page.
        Args:
            page_summary: Page summary from Notion API.
        """
        page_id = page_summary["id"]
        # load properties as metadata
--- a/langchain/document_loaders/obsidian.py
+++ b/langchain/document_loaders/obsidian.py
@ -1,4 +1,4 @@
-"""Loader that loads Obsidian directory dump."""
+"""Loads Obsidian directory dump."""
 import re
 from pathlib import Path
 from typing import List
@ -8,14 +8,21 @@ from langchain.document_loaders.base import BaseLoader
 class ObsidianLoader(BaseLoader):
-    """Loader that loads Obsidian files from disk."""
+    """Loads Obsidian files from disk."""
    FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
    def __init__(
        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
    ):
-        """Initialize with path."""
+        """Initialize with a path.
        Args:
            path: Path to the directory containing the Obsidian files.
            encoding: Charset encoding, defaults to "UTF-8"
            collect_metadata: Whether to collect metadata from the front matter.
                Defaults to True.
        """
        self.file_path = path
        self.encoding = encoding
        self.collect_metadata = collect_metadata
--- a/langchain/document_loaders/odt.py
+++ b/langchain/document_loaders/odt.py
@ -1,4 +1,4 @@
-"""Loader that loads Open Office ODT files."""
+"""Loads OpenOffice ODT files."""
 from typing import Any, List
 from langchain.document_loaders.unstructured import (
@ -8,11 +8,19 @@ from langchain.document_loaders.unstructured import (
 class UnstructuredODTLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load open office ODT files."""
+    """Loader that uses unstructured to load OpenOffice ODT files."""
    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """
        Args:
            file_path: The path to the file to load.
            mode: The mode to use when loading the file. Can be one of "single",
                "multi", or "all". Default is "single".
            **unstructured_kwargs: Any kwargs to pass to unstructured.
        """
        validate_unstructured_version(min_unstructured_version="0.6.3")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
--- a/langchain/document_loaders/onedrive.py
+++ b/langchain/document_loaders/onedrive.py
@ -1,4 +1,4 @@
-"""Loader that loads data from OneDrive"""
+"""Loads data from OneDrive"""
 from __future__ import annotations
 import logging
@ -60,11 +60,18 @@ class _SupportedFileTypes(BaseModel):
 class OneDriveLoader(BaseLoader, BaseModel):
    """Loads data from OneDrive."""
    settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
    """ The settings for the OneDrive API client."""
    drive_id: str = Field(...)
    """ The ID of the OneDrive drive to load data from."""
    folder_path: Optional[str] = None
    """ The path to the folder to load data from."""
    object_ids: Optional[List[str]] = None
    """ The IDs of the objects to load data from."""
    auth_with_token: bool = False
    """ Whether to authenticate with a token or not. Defaults to False."""
    def _auth(self) -> Type[Account]:
        """
--- a/langchain/document_loaders/onedrive_file.py
+++ b/langchain/document_loaders/onedrive_file.py
@ -16,10 +16,15 @@ CHUNK_SIZE = 1024 * 1024 * 5
 class OneDriveFileLoader(BaseLoader, BaseModel):
    """Loads a file from OneDrive."""
    file: File = Field(...)
    """The file to load."""
    class Config:
        arbitrary_types_allowed = True
        """Allow arbitrary types. This is needed for the File type. Default is True.
         See https://pydantic-docs.helpmanual.io/usage/types/#arbitrary-types-allowed"""
    def load(self) -> List[Document]:
        """Load Documents"""
--- a/langchain/document_loaders/open_city_data.py
+++ b/langchain/document_loaders/open_city_data.py
@ -5,13 +5,19 @@ from langchain.document_loaders.base import BaseLoader
 class OpenCityDataLoader(BaseLoader):
-    """Loader that loads Open city data."""
+    """Loads Open City data."""
    def __init__(self, city_id: str, dataset_id: str, limit: int):
-        """Initialize with dataset_id"""
+        """Initialize with dataset_id.
-        """ Example: https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6 """
+        Example: https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6
-        """ e.g., city_id = data.sfgov.org """
+        e.g., city_id = data.sfgov.org
-        """ e.g., dataset_id = vw6y-z8j6 """
+        e.g., dataset_id = vw6y-z8j6
        Args:
            city_id: The Open City city identifier.
            dataset_id: The Open City dataset identifier.
            limit: The maximum number of documents to load.
        """
        self.city_id = city_id
        self.dataset_id = dataset_id
        self.limit = limit
--- a/langchain/document_loaders/org_mode.py
+++ b/langchain/document_loaders/org_mode.py
@ -1,4 +1,4 @@
-"""Loader that loads Org-Mode files."""
+"""Loads Org-Mode files."""
 from typing import Any, List
 from langchain.document_loaders.unstructured import (
@ -13,6 +13,14 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """
        Args:
            file_path: The path to the file to load.
            mode: The mode to load the file from. Default is "single".
            **unstructured_kwargs: Any additional keyword arguments to pass
                to unstructured.
        """
        validate_unstructured_version(min_unstructured_version="0.7.9")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
--- a/langchain/document_loaders/pdf.py
+++ b/langchain/document_loaders/pdf.py
@ -1,4 +1,4 @@
-"""Loader that loads PDF files."""
+"""Loads PDF files."""
 import json
 import logging
 import os
@ -41,11 +41,11 @@ class BasePDFLoader(BaseLoader, ABC):
    """Base loader class for PDF files.
    Defaults to check for local file, but if the file is a web path, it will download it
-    to a temporary file, and use that, then clean up the temporary file after completion
+    to a temporary file, use it, then clean up the temporary file after completion
    """
    def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
        self.file_path = file_path
        self.web_path = None
        if "~" in self.file_path:
@ -86,7 +86,7 @@ class BasePDFLoader(BaseLoader, ABC):
 class OnlinePDFLoader(BasePDFLoader):
-    """Loader that loads online PDFs."""
+    """Loads online PDFs."""
    def load(self) -> List[Document]:
        """Load documents."""
@ -97,13 +97,13 @@ class OnlinePDFLoader(BasePDFLoader):
 class PyPDFLoader(BasePDFLoader):
    """Loads a PDF with pypdf and chunks at character level.
-    Loader also stores page numbers in metadatas.
+    Loader also stores page numbers in metadata.
    """
    def __init__(
        self, file_path: str, password: Optional[Union[str, bytes]] = None
    ) -> None:
-        """Initialize with file path."""
+        """Initialize with a file path."""
        try:
            import pypdf  # noqa:F401
        except ImportError:
@ -129,7 +129,7 @@ class PyPDFium2Loader(BasePDFLoader):
    """Loads a PDF with pypdfium2 and chunks at character level."""
    def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
        super().__init__(file_path)
        self.parser = PyPDFium2Parser()
@ -148,7 +148,7 @@ class PyPDFium2Loader(BasePDFLoader):
 class PyPDFDirectoryLoader(BaseLoader):
    """Loads a directory with PDF files with pypdf and chunks at character level.
-    Loader also stores page numbers in metadatas.
+    Loader also stores page numbers in metadata.
    """
    def __init__(
@ -222,7 +222,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
    """Loader that uses PDFMiner to load PDF files as HTML content."""
    def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
        try:
            from pdfminer.high_level import extract_text_to_fp  # noqa:F401
        except ImportError:
@ -256,7 +256,7 @@ class PyMuPDFLoader(BasePDFLoader):
    """Loader that uses PyMuPDF to load PDF files."""
    def __init__(self, file_path: str) -> None:
-        """Initialize with file path."""
+        """Initialize with a file path."""
        try:
            import fitz  # noqa:F401
        except ImportError:
@ -278,6 +278,8 @@ class PyMuPDFLoader(BasePDFLoader):
 # MathpixPDFLoader implementation taken largely from Daniel Gross's:
 # https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
 class MathpixPDFLoader(BasePDFLoader):
    """This class uses Mathpix service to load PDF files."""
    def __init__(
        self,
        file_path: str,
@ -286,6 +288,16 @@ class MathpixPDFLoader(BasePDFLoader):
        should_clean_pdf: bool = False,
        **kwargs: Any,
    ) -> None:
        """Initialize with a file path.
        Args:
            file_path: a file for loading.
            processed_file_format: a format of the processed file. Default is "mmd".
            max_wait_time_seconds: a maximum time to wait for the response from
             the server. Default is 500.
            should_clean_pdf: a flag to clean the PDF file. Default is False.
            **kwargs: additional keyword arguments.
        """
        super().__init__(file_path)
        self.mathpix_api_key = get_from_dict_or_env(
            kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
@ -324,6 +336,13 @@ class MathpixPDFLoader(BasePDFLoader):
            raise ValueError("Unable to send PDF to Mathpix.")
    def wait_for_processing(self, pdf_id: str) -> None:
        """Wait for processing to complete.
        Args:
            pdf_id: a PDF id.
        Returns: None
        """
        url = self.url + "/" + pdf_id
        for _ in range(0, self.max_wait_time_seconds, 5):
            response = requests.get(url, headers=self.headers)
@ -346,6 +365,14 @@ class MathpixPDFLoader(BasePDFLoader):
        return response.content.decode("utf-8")
    def clean_pdf(self, contents: str) -> str:
        """Clean the PDF file.
        Args:
            contents: a PDF file contents.
        Returns:
            The cleaned contents.
        """
        contents = "\n".join(
            [line for line in contents.split("\n") if not line.startswith("![]")]
        )
@ -375,7 +402,7 @@ class PDFPlumberLoader(BasePDFLoader):
    def __init__(
        self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
    ) -> None:
-        """Initialize with file path."""
+        """Initialize with a file path."""
        try:
            import pdfplumber  # noqa:F401
        except ImportError:
--- a/langchain/document_loaders/powerpoint.py
+++ b/langchain/document_loaders/powerpoint.py
@ -1,4 +1,4 @@
-"""Loader that loads powerpoint files."""
+"""Loads PowerPoint files."""
 import os
 from typing import List
@ -6,7 +6,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
 class UnstructuredPowerPointLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load powerpoint files."""
+    """Loader that uses unstructured to load PowerPoint files."""
    def _get_elements(self) -> List:
        from unstructured.__version__ import __version__ as __unstructured_version__
--- a/langchain/document_loaders/psychic.py
+++ b/langchain/document_loaders/psychic.py
@ -1,4 +1,4 @@
-"""Loader that loads documents from Psychic.dev."""
+"""Loads documents from Psychic.dev."""
 from typing import List, Optional
 from langchain.docstore.document import Document
@ -6,12 +6,18 @@ from langchain.document_loaders.base import BaseLoader
 class PsychicLoader(BaseLoader):
-    """Loader that loads documents from Psychic.dev."""
+    """Loads documents from Psychic.dev."""
    def __init__(
        self, api_key: str, account_id: str, connector_id: Optional[str] = None
    ):
-        """Initialize with API key, connector id, and account id."""
+        """Initialize with API key, connector id, and account id.
        Args:
            api_key: The Psychic API key.
            account_id: The Psychic account id.
            connector_id: The Psychic connector id.
        """
        try:
            from psychicapi import ConnectorId, Psychic  # noqa: F401
--- a/langchain/document_loaders/pyspark_dataframe.py
+++ b/langchain/document_loaders/pyspark_dataframe.py
@ -23,7 +23,15 @@ class PySparkDataFrameLoader(BaseLoader):
        page_content_column: str = "text",
        fraction_of_memory: float = 0.1,
    ):
-        """Initialize with a Spark DataFrame object."""
+        """Initialize with a Spark DataFrame object.
        Args:
            spark_session: The SparkSession object.
            df: The Spark DataFrame object.
            page_content_column: The name of the column containing the page content.
             Defaults to "text".
            fraction_of_memory: The fraction of memory to use. Defaults to 0.1.
        """
        try:
            from pyspark.sql import DataFrame, SparkSession
        except ImportError:
@ -48,7 +56,7 @@ class PySparkDataFrameLoader(BaseLoader):
        self.column_names = self.df.columns
    def get_num_rows(self) -> Tuple[int, int]:
-        """Gets the amount of "feasible" rows for the DataFrame"""
+        """Gets the number of "feasible" rows for the DataFrame"""
        try:
            import psutil
        except ImportError as e:
--- a/langchain/document_loaders/python.py
+++ b/langchain/document_loaders/python.py
@ -9,6 +9,11 @@ class PythonLoader(TextLoader):
    """
    def __init__(self, file_path: str):
        """Initialize with a file path.
        Args:
            file_path: The path to the file to load.
        """
        with open(file_path, "rb") as f:
            encoding, _ = tokenize.detect_encoding(f.readline)
        super().__init__(file_path=file_path, encoding=encoding)
--- a/langchain/document_loaders/readthedocs.py
+++ b/langchain/document_loaders/readthedocs.py
@ -1,4 +1,4 @@
-"""Loader that loads ReadTheDocs documentation directory dump."""
+"""Loads ReadTheDocs documentation directory dump."""
 from pathlib import Path
 from typing import Any, List, Optional, Tuple, Union
@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
 class ReadTheDocsLoader(BaseLoader):
-    """Loader that loads ReadTheDocs documentation directory dump."""
+    """Loads ReadTheDocs documentation directory dump."""
    def __init__(
        self,
@ -20,7 +20,7 @@ class ReadTheDocsLoader(BaseLoader):
        """
        Initialize ReadTheDocsLoader
-        The loader loops over all files under `path` and extract the actual content of
+        The loader loops over all files under `path` and extracts the actual content of
        the files by retrieving main html tags. Default main html tags include
        `<main id="main-content">`, `<div role="main">`, and `<article role="main">`.
        You can also define your own html tags by passing custom_html_tag, e.g.
@ -31,7 +31,7 @@ class ReadTheDocsLoader(BaseLoader):
        Args:
            path: The location of pulled readthedocs folder.
            encoding: The encoding with which to open the documents.
-            errors: Specifies how encoding and decoding errors are to be handled—this
+            errors: Specify how encoding and decoding errors are to be handled—this
                cannot be used in binary mode.
            custom_html_tag: Optional custom html tag to retrieve the content from
                files.
--- a/langchain/document_loaders/recursive_url_loader.py
+++ b/langchain/document_loaders/recursive_url_loader.py
@ -8,17 +8,27 @@ from langchain.document_loaders.base import BaseLoader
 class RecursiveUrlLoader(BaseLoader):
-    """Loader that loads all child links from a given url."""
+    """Loads all child links from a given url."""
    def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None:
-        """Initialize with URL to crawl and any sub-directories to exclude."""
+        """Initialize with URL to crawl and any subdirectories to exclude.
        Args:
            url: The URL to crawl.
            exclude_dirs: A list of subdirectories to exclude.
        """
        self.url = url
        self.exclude_dirs = exclude_dirs
    def get_child_links_recursive(
        self, url: str, visited: Optional[Set[str]] = None
    ) -> Set[str]:
-        """Recursively get all child links starting with the path of the input URL."""
+        """Recursively get all child links starting with the path of the input URL.
        Args:
            url: The URL to crawl.
            visited: A set of visited URLs.
        """
        try:
            from bs4 import BeautifulSoup
@ -39,7 +49,7 @@ class RecursiveUrlLoader(BaseLoader):
        if not parent_url.endswith("/"):
            parent_url += "/"
-        # Exclude the root and parent from list
+        # Exclude the root and parent from the list
        visited = set() if visited is None else visited
        # Exclude the links that start with any of the excluded directories
--- a/langchain/document_loaders/reddit.py
+++ b/langchain/document_loaders/reddit.py
@ -23,7 +23,7 @@ def _dependable_praw_import() -> praw:
 class RedditPostsLoader(BaseLoader):
    """Reddit posts loader.
    Read posts on a subreddit.
-    First you need to go to
+    First, you need to go to
    https://www.reddit.com/prefs/apps/
    and create your application
    """
@ -38,6 +38,20 @@ class RedditPostsLoader(BaseLoader):
        categories: Sequence[str] = ["new"],
        number_posts: Optional[int] = 10,
    ):
        """
        Initialize with client_id, client_secret, user_agent, search_queries, mode,
            categories, number_posts.
        Example: https://www.reddit.com/r/learnpython/
        Args:
            client_id: Reddit client id.
            client_secret: Reddit client secret.
            user_agent: Reddit user agent.
            search_queries: The search queries.
            mode: The mode.
            categories: The categories. Default: ["new"]
            number_posts: The number of posts. Default: 10
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.user_agent = user_agent
--- a/langchain/document_loaders/roam.py
+++ b/langchain/document_loaders/roam.py
@ -1,4 +1,4 @@
-"""Loader that loads Roam directory dump."""
+"""Loads Roam directory dump."""
 from pathlib import Path
 from typing import List
@ -7,10 +7,10 @@ from langchain.document_loaders.base import BaseLoader
 class RoamLoader(BaseLoader):
-    """Loader that loads Roam files from disk."""
+    """Loads Roam files from disk."""
    def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a path."""
        self.file_path = path
    def load(self) -> List[Document]:
--- a/langchain/document_loaders/rst.py
+++ b/langchain/document_loaders/rst.py
@ -1,4 +1,4 @@
-"""Loader that loads RST files."""
+"""Loads RST files."""
 from typing import Any, List
 from langchain.document_loaders.unstructured import (
@ -13,6 +13,16 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """
        Initialize with a file path.
        Args:
            file_path: The path to the file to load.
            mode: The mode to use for partitioning. See unstructured for details.
                Defaults to "single".
            **unstructured_kwargs: Additional keyword arguments to pass
                to unstructured.
        """
        validate_unstructured_version(min_unstructured_version="0.7.5")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
--- a/langchain/document_loaders/rtf.py
+++ b/langchain/document_loaders/rtf.py
@ -1,4 +1,4 @@
-"""Loader that loads rich text files."""
+"""Loads rich text files."""
 from typing import Any, List
 from langchain.document_loaders.unstructured import (
@ -13,6 +13,16 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
        """
        Initialize with a file path.
        Args:
            file_path: The path to the file to load.
            mode: The mode to use for partitioning. See unstructured for details.
                Defaults to "single".
            **unstructured_kwargs: Additional keyword arguments to pass
                to unstructured.
        """
        min_unstructured_version = "0.5.12"
        if not satisfies_min_unstructured_version(min_unstructured_version):
            raise ValueError(
--- a/langchain/document_loaders/s3_directory.py
+++ b/langchain/document_loaders/s3_directory.py
@ -1,4 +1,4 @@
-"""Loading logic for loading documents from an s3 directory."""
+"""Loading logic for loading documents from an AWS S3 directory."""
 from typing import List
 from langchain.docstore.document import Document
@ -7,10 +7,15 @@ from langchain.document_loaders.s3_file import S3FileLoader
 class S3DirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from s3."""
+    """Loading logic for loading documents from AWS S3."""
    def __init__(self, bucket: str, prefix: str = ""):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key prefix.
        Args:
            bucket: The name of the S3 bucket.
            prefix: The prefix of the S3 key. Defaults to "".
        """
        self.bucket = bucket
        self.prefix = prefix
--- a/langchain/document_loaders/s3_file.py
+++ b/langchain/document_loaders/s3_file.py
@ -1,4 +1,4 @@
-"""Loading logic for loading documents from an s3 file."""
+"""Loading logic for loading documents from an AWS S3 file."""
 import os
 import tempfile
 from typing import List
@ -9,10 +9,15 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
 class S3FileLoader(BaseLoader):
-    """Loading logic for loading documents from s3."""
+    """Loading logic for loading documents from an AWS S3 file."""
    def __init__(self, bucket: str, key: str):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
        Args:
            bucket: The name of the S3 bucket.
            key: The key of the S3 object.
        """
        self.bucket = bucket
        self.key = key
--- a/langchain/document_loaders/sitemap.py
+++ b/langchain/document_loaders/sitemap.py
@ -42,11 +42,12 @@ class SitemapLoader(WebBaseLoader):
                urls that are parsed and loaded
            parsing_function: Function to parse bs4.Soup output
            blocksize: number of sitemap locations per block
-            blocknum: the number of the block that should be loaded - zero indexed
+            blocknum: the number of the block that should be loaded - zero indexed.
                Default: 0
            meta_function: Function to parse bs4.Soup output for metadata
                remember when setting this method to also copy metadata["loc"]
                to metadata["source"] if you are using this field
-            is_local: whether the sitemap is a local file
+            is_local: whether the sitemap is a local file. Default: False
        """
        if blocksize is not None and blocksize < 1:
@ -72,7 +73,14 @@ class SitemapLoader(WebBaseLoader):
        self.is_local = is_local
    def parse_sitemap(self, soup: Any) -> List[dict]:
-        """Parse sitemap xml and load into a list of dicts."""
+        """Parse sitemap xml and load into a list of dicts.
        Args:
            soup: BeautifulSoup object.
        Returns:
            List of dicts.
        """
        els = []
        for url in soup.find_all("url"):
            loc = url.find("loc")
--- a/langchain/document_loaders/slack_directory.py
+++ b/langchain/document_loaders/slack_directory.py
@ -9,7 +9,7 @@ from langchain.document_loaders.base import BaseLoader
 class SlackDirectoryLoader(BaseLoader):
-    """Loader for loading documents from a Slack directory dump."""
+    """Loads documents from a Slack directory dump."""
    def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
        """Initialize the SlackDirectoryLoader.
--- a/langchain/document_loaders/snowflake_loader.py
+++ b/langchain/document_loaders/snowflake_loader.py
@ -41,6 +41,7 @@ class SnowflakeLoader(BaseLoader):
            role: Snowflake role.
            database: Snowflake database
            schema: Snowflake schema
            parameters: Optional. Parameters to pass to the query.
            page_content_columns: Optional. Columns written to Document `page_content`.
            metadata_columns: Optional. Columns written to Document `metadata`.
        """
@ -62,7 +63,7 @@ class SnowflakeLoader(BaseLoader):
        try:
            import snowflake.connector
        except ImportError as ex:
-            raise ValueError(
+            raise ImportError(
                "Could not import snowflake-connector-python package. "
                "Please install it with `pip install snowflake-connector-python`."
            ) from ex
--- a/langchain/document_loaders/spreedly.py
+++ b/langchain/document_loaders/spreedly.py
@ -23,6 +23,12 @@ class SpreedlyLoader(BaseLoader):
    """Loader that fetches data from Spreedly API."""
    def __init__(self, access_token: str, resource: str) -> None:
        """Initialize with an access token and a resource.
        Args:
            access_token: The access token.
            resource: The resource.
        """
        self.access_token = access_token
        self.resource = resource
        self.headers = {
--- a/langchain/document_loaders/srt.py
+++ b/langchain/document_loaders/srt.py
@ -9,7 +9,7 @@ class SRTLoader(BaseLoader):
    """Loader for .srt (subtitle) files."""
    def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
        try:
            import pysrt  # noqa:F401
        except ImportError:
--- a/langchain/document_loaders/stripe.py
+++ b/langchain/document_loaders/stripe.py
@ -21,6 +21,12 @@ class StripeLoader(BaseLoader):
    """Loader that fetches data from Stripe."""
    def __init__(self, resource: str, access_token: Optional[str] = None) -> None:
        """Initialize with a resource and an access token.
        Args:
            resource: The resource.
            access_token: The access token.
        """
        self.resource = resource
        access_token = access_token or get_from_env(
            "access_token", "STRIPE_ACCESS_TOKEN"
--- a/langchain/document_loaders/telegram.py
+++ b/langchain/document_loaders/telegram.py
@ -1,4 +1,4 @@
-"""Loader that loads Telegram chat json dump."""
+"""Loads Telegram chat json dump."""
 from __future__ import annotations
 import asyncio
@ -24,10 +24,10 @@ def concatenate_rows(row: dict) -> str:
 class TelegramChatFileLoader(BaseLoader):
-    """Loader that loads Telegram chat json directory dump."""
+    """Loads Telegram chat json directory dump."""
    def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a path."""
        self.file_path = path
    def load(self) -> List[Document]:
@ -79,7 +79,7 @@ def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
 class TelegramChatApiLoader(BaseLoader):
-    """Loader that loads Telegram chat json directory dump."""
+    """Loads Telegram chat data fetched through the Telegram API."""
    def __init__(
        self,
@ -89,7 +89,16 @@ class TelegramChatApiLoader(BaseLoader):
        username: Optional[str] = None,
        file_path: str = "telegram_data.json",
    ):
-        """Initialize with API parameters."""
+        """Initialize with API parameters.
        Args:
            chat_entity: The chat entity to fetch data from.
            api_id: The API ID.
            api_hash: The API hash.
            username: The username.
            file_path: The file path to save the data to. Defaults to
                 "telegram_data.json".
        """
        self.chat_entity = chat_entity
        self.api_id = api_id
        self.api_hash = api_hash
--- a/langchain/document_loaders/tomarkdown.py
+++ b/langchain/document_loaders/tomarkdown.py
@ -1,4 +1,4 @@
-"""Loader that loads HTML to markdown using 2markdown."""
+"""Loads HTML to markdown using 2markdown."""
 from __future__ import annotations
 from typing import Iterator, List
@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
 class ToMarkdownLoader(BaseLoader):
-    """Loader that loads HTML to markdown using 2markdown."""
+    """Loads HTML to markdown using 2markdown."""
    def __init__(self, url: str, api_key: str):
        """Initialize with url and api key."""
--- a/langchain/document_loaders/trello.py
+++ b/langchain/document_loaders/trello.py
@ -1,4 +1,4 @@
-"""Loader that loads cards from Trello"""
+"""Loads cards from Trello."""
 from __future__ import annotations
 from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple
--- a/langchain/document_loaders/whatsapp_chat.py
+++ b/langchain/document_loaders/whatsapp_chat.py
@ -12,7 +12,7 @@ def concatenate_rows(date: str, sender: str, text: str) -> str:
 class WhatsAppChatLoader(BaseLoader):
-    """Loader that loads WhatsApp messages text file."""
+    """Loads WhatsApp messages text file."""
    def __init__(self, path: str):
        """Initialize with path."""
--- a/langchain/document_loaders/word_document.py
+++ b/langchain/document_loaders/word_document.py
@ -1,4 +1,4 @@
-"""Loader that loads word documents."""
+"""Loads word documents."""
 import os
 import tempfile
 from abc import ABC
--- a/langchain/document_loaders/xml.py
+++ b/langchain/document_loaders/xml.py
@ -1,4 +1,4 @@
-"""Loader that loads Microsoft Excel files."""
+"""Loads XML files."""
 from typing import Any, List
 from langchain.document_loaders.unstructured import (
--- a/langchain/document_loaders/youtube.py
+++ b/langchain/document_loaders/youtube.py
@ -1,4 +1,4 @@
-"""Loader that loads YouTube transcript."""
+"""Loads YouTube transcript."""
 from __future__ import annotations
 import logging
@ -140,7 +140,7 @@ def _parse_video_id(url: str) -> Optional[str]:
 class YoutubeLoader(BaseLoader):
-    """Loader that loads Youtube transcripts."""
+    """Loads Youtube transcripts."""
    def __init__(
        self,
@ -252,7 +252,7 @@ class YoutubeLoader(BaseLoader):
@dataclass
 class GoogleApiYoutubeLoader(BaseLoader):
-    """Loader that loads all Videos from a Channel
+    """Loads all videos from a channel.
    To use, you should have the ``googleapiclient,youtube_transcript_api``
    python package installed.