docstrings document_loaders 2 (#6890)

Updated docstrings for the `document_loaders`.

Maintainer responsibilities:
- DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev

parent 77ae8084a0
commit 1feac83323
@@ -9,7 +9,7 @@ class CollegeConfidentialLoader(WebBaseLoader):
     """Loader that loads College Confidential webpages."""

     def load(self) -> List[Document]:
-        """Load webpage."""
+        """Load webpages as Documents."""
         soup = self.scrape()
         text = soup.select_one("main[class='skin-handler']").text
         metadata = {"source": self.web_path}
@@ -33,8 +33,9 @@ class ContentFormat(str, Enum):


 class ConfluenceLoader(BaseLoader):
-    """
-    Load Confluence pages. Port of https://llamahub.ai/l/confluence
+    """Load Confluence pages.
+
+    Port of https://llamahub.ai/l/confluence
     This currently supports username/api_key, Oauth2 login or personal access token
     authentication.

@@ -175,7 +176,7 @@ class ConfluenceLoader(BaseLoader):
                 "key_cert",
             ]:
                 errors.append(
-                    "You have either ommited require keys or added extra "
+                    "You have either omitted require keys or added extra "
                     "keys to the oauth2 dictionary. key values should be "
                     "`['access_token', 'access_token_secret', 'consumer_key', 'key_cert']`"
                 )
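For reference, a minimal usage sketch of the oauth2-style initialization implied by the check above; the site URL, space key, and credential values are hypothetical placeholders:

from langchain.document_loaders import ConfluenceLoader

# The oauth2 dict must carry exactly the four keys validated above.
loader = ConfluenceLoader(
    url="https://example.atlassian.net/wiki",  # hypothetical site URL
    oauth2={
        "access_token": "...",
        "access_token_secret": "...",
        "consumer_key": "...",
        "key_cert": "...",
    },
)
docs = loader.load(space_key="SPACE", limit=50)  # hypothetical space key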
@@ -343,7 +344,7 @@ class ConfluenceLoader(BaseLoader):
         doesn't match the limit value. If `limit` is >100 confluence
         seems to cap the response to 100. Also, due to the Atlassian Python
         package, we don't get the "next" values from the "_links" key because
-        they only return the value from the results key. So here, the pagination
+        they only return the value from the result key. So here, the pagination
         starts from 0 and goes until the max_pages, getting the `limit` number
         of pages with each request. We have to manually check if there
         are more docs based on the length of the returned list of pages, rather than
@@ -10,11 +10,11 @@ class CoNLLULoader(BaseLoader):
     """Load CoNLL-U files."""

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         self.file_path = file_path

     def load(self) -> List[Document]:
-        """Load from file path."""
+        """Load from a file path."""
         with open(self.file_path, encoding="utf8") as f:
             tsv = list(csv.reader(f, delimiter="\t"))
@@ -37,6 +37,16 @@ class CSVLoader(BaseLoader):
         csv_args: Optional[Dict] = None,
         encoding: Optional[str] = None,
     ):
+        """
+
+        Args:
+            file_path: The path to the CSV file.
+            source_column: The name of the column in the CSV file to use as the source.
+                Optional. Defaults to None.
+            csv_args: A dictionary of arguments to pass to the csv.DictReader.
+                Optional. Defaults to None.
+            encoding: The encoding of the CSV file. Optional. Defaults to None.
+        """
         self.file_path = file_path
         self.source_column = source_column
         self.encoding = encoding
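A minimal usage sketch of the CSVLoader arguments documented above; the file path and column name are hypothetical:

from langchain.document_loaders.csv_loader import CSVLoader

# One Document is produced per CSV row; source_column overrides the
# file path as each row's metadata "source".
loader = CSVLoader(
    file_path="data/example.csv",  # hypothetical path
    source_column="url",           # hypothetical column name
    csv_args={"delimiter": ";"},   # passed through to csv.DictReader
    encoding="utf-8",
)
docs = loader.load()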
@@ -73,6 +83,14 @@ class UnstructuredCSVLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+
+        Args:
+            file_path: The path to the CSV file.
+            mode: The mode to use when loading the CSV file.
+                Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.6.8")
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

@@ -1,4 +1,4 @@
-"""Load from Dataframe object"""
+"""Load from a Dataframe object"""
 from typing import Any, Iterator, List

 from langchain.docstore.document import Document
@@ -6,10 +6,16 @@ from langchain.document_loaders.base import BaseLoader


 class DataFrameLoader(BaseLoader):
-    """Load Pandas DataFrames."""
+    """Load Pandas DataFrame."""

     def __init__(self, data_frame: Any, page_content_column: str = "text"):
-        """Initialize with dataframe object."""
+        """Initialize with dataframe object.
+
+        Args:
+            data_frame: Pandas DataFrame object.
+            page_content_column: Name of the column containing the page content.
+                Defaults to "text".
+        """
         import pandas as pd

         if not isinstance(data_frame, pd.DataFrame):
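A short sketch of DataFrameLoader as documented above; the frame contents are made up:

import pandas as pd
from langchain.document_loaders import DataFrameLoader

df = pd.DataFrame({"text": ["hello", "world"], "topic": ["greetings", "planets"]})
loader = DataFrameLoader(df, page_content_column="text")
docs = loader.load()  # remaining columns (here "topic") become Document metadata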
@@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)


 class DiffbotLoader(BaseLoader):
-    """Loader that loads Diffbot file json."""
+    """Loads Diffbot file json."""

     def __init__(
         self, api_token: str, urls: List[str], continue_on_failure: bool = True
     ):
-        """Initialize with API token, ids, and key."""
+        """Initialize with API token, ids, and key.
+
+        Args:
+            api_token: Diffbot API token.
+            urls: List of URLs to load.
+            continue_on_failure: Whether to continue loading other URLs if one fails.
+                Defaults to True.
+        """
         self.api_token = api_token
         self.urls = urls
         self.continue_on_failure = continue_on_failure
@@ -38,7 +45,7 @@ class DiffbotLoader(BaseLoader):
         return response.json() if response.ok else {}

     def load(self) -> List[Document]:
-        """Extract text from Diffbot on all the URLs and return Document instances"""
+        """Extract text from Diffbot on all the URLs and return Documents"""
         docs: List[Document] = list()

         for url in self.urls:
@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from a directory."""
+"""Load documents from a directory."""
 import concurrent
 import logging
 from pathlib import Path
@@ -25,7 +25,7 @@ def _is_visible(p: Path) -> bool:


 class DirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from a directory."""
+    """Load documents from a directory."""

     def __init__(
         self,
@@ -40,7 +40,22 @@ class DirectoryLoader(BaseLoader):
         use_multithreading: bool = False,
         max_concurrency: int = 4,
     ):
-        """Initialize with path to directory and how to glob over it."""
+        """Initialize with a path to directory and how to glob over it.
+
+        Args:
+            path: Path to directory.
+            glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
+                (all files except hidden).
+            silent_errors: Whether to silently ignore errors. Defaults to False.
+            load_hidden: Whether to load hidden files. Defaults to False.
+            loader_cls: Loader class to use for loading files.
+                Defaults to UnstructuredFileLoader.
+            loader_kwargs: Keyword arguments to pass to loader_cls. Defaults to None.
+            recursive: Whether to recursively search for files. Defaults to False.
+            show_progress: Whether to show a progress bar. Defaults to False.
+            use_multithreading: Whether to use multithreading. Defaults to False.
+            max_concurrency: The maximum number of threads to use. Defaults to 4.
+        """
         if loader_kwargs is None:
             loader_kwargs = {}
         self.path = path
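A usage sketch built from the arguments above; the directory path and glob are hypothetical:

from langchain.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader(
    "docs/",                 # hypothetical directory
    glob="**/*.md",
    loader_cls=TextLoader,   # instead of the default UnstructuredFileLoader
    recursive=True,
    show_progress=True,
    use_multithreading=True,
    max_concurrency=4,
)
docs = loader.load()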
@@ -57,6 +72,14 @@ class DirectoryLoader(BaseLoader):
     def load_file(
         self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any]
     ) -> None:
+        """Load a file.
+
+        Args:
+            item: File path.
+            path: Directory path.
+            docs: List of documents to append to.
+            pbar: Progress bar. Defaults to None.
+        """
         if item.is_file():
             if _is_visible(item.relative_to(path)) or self.load_hidden:
                 try:
@@ -14,7 +14,12 @@ class DiscordChatLoader(BaseLoader):
     """Load Discord chat logs."""

     def __init__(self, chat_log: pd.DataFrame, user_id_col: str = "ID"):
-        """Initialize with a Pandas DataFrame containing chat logs."""
+        """Initialize with a Pandas DataFrame containing chat logs.
+
+        Args:
+            chat_log: Pandas DataFrame containing chat logs.
+            user_id_col: Name of the column containing the user ID. Defaults to "ID".
+        """
         if not isinstance(chat_log, pd.DataFrame):
             raise ValueError(
                 f"Expected chat_log to be a pd.DataFrame, got {type(chat_log)}"
@@ -1,4 +1,4 @@
-"""Loader that loads processed documents from Docugami."""
+"""Loads processed documents from Docugami."""

 import io
 import logging
@@ -29,22 +29,35 @@ logger = logging.getLogger(__name__)


 class DocugamiLoader(BaseLoader, BaseModel):
-    """Loader that loads processed docs from Docugami.
+    """Loads processed docs from Docugami.

     To use, you should have the ``lxml`` python package installed.
     """

     api: str = DEFAULT_API_ENDPOINT
+    """The Docugami API endpoint to use."""

     access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY")
+    """The Docugami API access token to use."""
     docset_id: Optional[str]
+    """The Docugami API docset ID to use."""
     document_ids: Optional[Sequence[str]]
+    """The Docugami API document IDs to use."""
     file_paths: Optional[Sequence[Union[Path, str]]]
+    """The local file paths to use."""
     min_chunk_size: int = 32  # appended to the next chunk to avoid over-chunking
+    """The minimum chunk size to use when parsing DGML. Defaults to 32."""

     @root_validator
     def validate_local_or_remote(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        """Validate that either local file paths are given, or remote API docset ID."""
+        """Validate that either local file paths are given, or remote API docset ID.
+
+        Args:
+            values: The values to validate.
+
+        Returns:
+            The validated values.
+        """
         if values.get("file_paths") and values.get("docset_id"):
             raise ValueError("Cannot specify both file_paths and remote API docset_id")
@@ -22,6 +22,20 @@ class DuckDBLoader(BaseLoader):
         page_content_columns: Optional[List[str]] = None,
         metadata_columns: Optional[List[str]] = None,
     ):
+        """
+
+        Args:
+            query: The query to execute.
+            database: The database to connect to. Defaults to ":memory:".
+            read_only: Whether to open the database in read-only mode.
+                Defaults to False.
+            config: A dictionary of configuration options to pass to the database.
+                Optional.
+            page_content_columns: The columns to write into the `page_content`
+                of the document. Optional.
+            metadata_columns: The columns to write into the `metadata` of the document.
+                Optional.
+        """
         self.query = query
         self.database = database
         self.read_only = read_only
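A sketch of DuckDBLoader with the arguments above; the query and table name are hypothetical:

from langchain.document_loaders import DuckDBLoader

loader = DuckDBLoader(
    query="SELECT title, body FROM posts",  # hypothetical table
    database=":memory:",
    page_content_columns=["body"],   # written into page_content
    metadata_columns=["title"],      # written into metadata
)
docs = loader.load()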
@@ -1,4 +1,4 @@
-"""Loader that loads email files."""
+"""Loads email files."""
 import os
 from typing import Any, List

@@ -72,12 +72,17 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):

 class OutlookMessageLoader(BaseLoader):
     """
-    Loader that loads Outlook Message files using extract_msg.
+    Loads Outlook Message files using extract_msg.

     https://github.com/TeamMsgExtractor/msg-extractor
     """

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path.
+
+        Args:
+            file_path: The path to the Outlook Message file.
+        """

         self.file_path = file_path

@@ -52,7 +52,10 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):


 class BaseEmbaasLoader(BaseModel):
+    """Base class for embedding a model into an Embaas document extraction API."""

     embaas_api_key: Optional[str] = None
+    """The API key for the embaas document extraction API."""
     api_url: str = EMBAAS_DOC_API_URL
+    """The URL of the embaas document extraction API."""
     params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
@@ -69,7 +72,7 @@ class BaseEmbaasLoader(BaseModel):


 class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
-    """Wrapper around embaas's document byte loader service.
+    """Embaas's document byte loader.

     To use, you should have the
     environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
@@ -99,6 +102,11 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
     """

     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Parses the blob lazily.
+
+        Args:
+            blob: The blob to parse.
+        """
         yield from self._get_documents(blob=blob)

     @staticmethod
@@ -170,7 +178,7 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):


 class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
-    """Wrapper around embaas's document loader service.
+    """Embaas's document loader.

     To use, you should have the
     environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
@@ -14,6 +14,7 @@ from langchain.document_loaders.base import BaseLoader

 class EverNoteLoader(BaseLoader):
     """EverNote Loader.
+
     Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
     Instructions on producing this file can be found at
     https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML
@@ -13,6 +13,14 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+
+        Args:
+            file_path: The path to the Microsoft Excel file.
+            mode: The mode to use when partitioning the file. See unstructured docs
+                for more info. Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.6.7")
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

@@ -9,7 +9,11 @@ from langchain.document_loaders.base import BaseLoader


 def concatenate_rows(row: dict) -> str:
-    """Combine message information in a readable format ready to be used."""
+    """Combine message information in a readable format ready to be used.
+
+    Args:
+        row: dictionary containing message information.
+    """
     sender = row["sender_name"]
     text = row["content"]
     date = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000).strftime(
@@ -19,10 +23,10 @@ def concatenate_rows(row: dict) -> str:


 class FacebookChatLoader(BaseLoader):
-    """Loader that loads Facebook messages json directory dump."""
+    """Loads Facebook messages json directory dump."""

     def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a path."""
         self.file_path = path

     def load(self) -> List[Document]:
@@ -9,10 +9,16 @@ from langchain.utils import stringify_dict


 class FigmaFileLoader(BaseLoader):
-    """Loader that loads Figma file json."""
+    """Loads Figma file json."""

     def __init__(self, access_token: str, ids: str, key: str):
-        """Initialize with access token, ids, and key."""
+        """Initialize with access token, ids, and key.
+
+        Args:
+            access_token: The access token for the Figma REST API.
+            ids: The ids of the Figma file.
+            key: The key for the Figma file
+        """
         self.access_token = access_token
         self.ids = ids
         self.key = key
@@ -7,10 +7,16 @@ from langchain.document_loaders.gcs_file import GCSFileLoader


 class GCSDirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from GCS."""
+    """Loads Documents from GCS."""

     def __init__(self, project_name: str, bucket: str, prefix: str = ""):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            project_name: The name of the project for the GCS bucket.
+            bucket: The name of the GCS bucket.
+            prefix: The prefix of the GCS bucket.
+        """
         self.project_name = project_name
         self.bucket = bucket
         self.prefix = prefix
@@ -20,7 +26,7 @@ class GCSDirectoryLoader(BaseLoader):
         try:
             from google.cloud import storage
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "Could not import google-cloud-storage python package. "
                 "Please install it with `pip install google-cloud-storage`."
             )
@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from a GCS file."""
+"""Load documents from a GCS file."""
 import os
 import tempfile
 from typing import List
@@ -9,10 +9,16 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader


 class GCSFileLoader(BaseLoader):
-    """Loading logic for loading documents from GCS."""
+    """Load Documents from a GCS file."""

     def __init__(self, project_name: str, bucket: str, blob: str):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            project_name: The name of the project to load
+            bucket: The name of the GCS bucket.
+            blob: The name of the GCS blob to load.
+        """
         self.bucket = bucket
         self.blob = blob
         self.project_name = project_name
@@ -22,7 +28,7 @@ class GCSFileLoader(BaseLoader):
         try:
             from google.cloud import storage
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "Could not import google-cloud-storage python package. "
                 "Please install it with `pip install google-cloud-storage`."
             )
@@ -7,9 +7,9 @@ from langchain.document_loaders.base import BaseLoader

 class GitLoader(BaseLoader):
     """Loads files from a Git repository into a list of documents.
-    Repository can be local on disk available at `repo_path`,
+    The Repository can be local on disk available at `repo_path`,
     or remote at `clone_url` that will be cloned to `repo_path`.
-    Currently supports only text files.
+    Currently, supports only text files.

     Each document represents one file in the repository. The `path` points to
     the local Git repository, and the `branch` specifies the branch to load
@@ -23,6 +23,15 @@ class GitLoader(BaseLoader):
         branch: Optional[str] = "main",
         file_filter: Optional[Callable[[str], bool]] = None,
     ):
+        """
+
+        Args:
+            repo_path: The path to the Git repository.
+            clone_url: Optional. The URL to clone the repository from.
+            branch: Optional. The branch to load files from. Defaults to `main`.
+            file_filter: Optional. A function that takes a file path and returns
+                a boolean indicating whether to load the file. Defaults to None.
+        """
         self.repo_path = repo_path
         self.clone_url = clone_url
         self.branch = branch
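A sketch of GitLoader using the arguments above; the local path is hypothetical:

from langchain.document_loaders import GitLoader

# Clones clone_url into repo_path if the repository is not already there;
# file_filter keeps only Python files.
loader = GitLoader(
    repo_path="./example_repo",  # hypothetical local path
    clone_url="https://github.com/hwchase17/langchain",
    branch="master",
    file_filter=lambda file_path: file_path.endswith(".py"),
)
docs = loader.load()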
@@ -28,7 +28,9 @@ class GitbookLoader(WebBaseLoader):
             load_all_paths: If set to True, all relative paths in the navbar
                 are loaded instead of only `web_page`.
             base_url: If `load_all_paths` is True, the relative paths are
-                appended to this base url. Defaults to `web_page` if not set.
+                appended to this base url. Defaults to `web_page`.
+            content_selector: The CSS selector for the content to load.
+                Defaults to "main".
         """
         self.base_url = base_url or web_page
         if self.base_url.endswith("/"):
@@ -35,6 +35,8 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC):


 class GitHubIssuesLoader(BaseGitHubLoader):
+    """Load issues of a GitHub repository."""
+
     include_prs: bool = True
     """If True include Pull Requests in results, otherwise ignore them."""
     milestone: Union[int, Literal["*", "none"], None] = None
@@ -159,6 +161,7 @@ class GitHubIssuesLoader(BaseGitHubLoader):

     @property
     def query_params(self) -> str:
+        """Create query parameters for GitHub API."""
         labels = ",".join(self.labels) if self.labels else self.labels
         query_params_dict = {
             "milestone": self.milestone,
@@ -179,4 +182,5 @@ class GitHubIssuesLoader(BaseGitHubLoader):

     @property
     def url(self) -> str:
+        """Create URL for GitHub API."""
         return f"https://api.github.com/repos/{self.repo}/issues?{self.query_params}"
@@ -22,21 +22,32 @@ SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]


 class GoogleDriveLoader(BaseLoader, BaseModel):
-    """Loader that loads Google Docs from Google Drive."""
+    """Loads Google Docs from Google Drive."""

     service_account_key: Path = Path.home() / ".credentials" / "keys.json"
+    """Path to the service account key file."""
     credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
+    """Path to the credentials file."""
     token_path: Path = Path.home() / ".credentials" / "token.json"
+    """Path to the token file."""
     folder_id: Optional[str] = None
+    """The folder id to load from."""
     document_ids: Optional[List[str]] = None
+    """The document ids to load from."""
     file_ids: Optional[List[str]] = None
+    """The file ids to load from."""
     recursive: bool = False
+    """Whether to load recursively. Only applies when folder_id is given."""
     file_types: Optional[Sequence[str]] = None
+    """The file types to load. Only applies when folder_id is given."""
     load_trashed_files: bool = False
+    """Whether to load trashed files. Only applies when folder_id is given."""
     # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
     # results in pydantic validation errors
     file_loader_cls: Any = None
+    """The file loader class to use."""
     file_loader_kwargs: Dict["str", Any] = {}
+    """The file loader kwargs to use."""

     @root_validator
     def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
@@ -1,4 +1,4 @@
-"""Loader that loads .txt web files."""
+"""Loads .txt web files."""
 from typing import List

 from langchain.docstore.document import Document
@@ -9,7 +9,7 @@ class GutenbergLoader(BaseLoader):
     """Loader that uses urllib to load .txt web files."""

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         if not file_path.startswith("https://www.gutenberg.org"):
             raise ValueError("file path must start with 'https://www.gutenberg.org'")

@@ -5,9 +5,14 @@ from typing import List, NamedTuple, Optional, cast


 class FileEncoding(NamedTuple):
+    """A file encoding as the NamedTuple."""
+
     encoding: Optional[str]
+    """The encoding of the file."""
     confidence: float
+    """The confidence of the encoding."""
     language: Optional[str]
+    """The language of the file."""


 def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
@@ -15,6 +20,10 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding

     Returns a list of `FileEncoding` tuples with the detected encodings ordered
     by confidence.
+
+    Args:
+        file_path: The path to the file to detect the encoding for.
+        timeout: The timeout in seconds for the encoding detection.
     """
     import chardet

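A sketch of calling detect_file_encodings; the module path and file name are assumptions based on this hunk:

from langchain.document_loaders.helpers import detect_file_encodings

encodings = detect_file_encodings("data/unknown.txt", timeout=5)  # hypothetical file
best = encodings[0]  # results are ordered by confidence
print(best.encoding, best.confidence, best.language)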
@@ -1,4 +1,4 @@
-"""Loader that loads HN."""
+"""Loader that loads Hacker News."""
 from typing import Any, List

 from langchain.docstore.document import Document
@@ -11,7 +11,7 @@ class HNLoader(WebBaseLoader):
     def load(self) -> List[Document]:
         """Get important HN webpage information.

-        Components are:
+        HN webpage components are:
             - title
             - content
             - source url,
@@ -20,11 +20,18 @@ class BSHTMLLoader(BaseLoader):
         get_text_separator: str = "",
     ) -> None:
         """Initialise with path, and optionally, file encoding to use, and any kwargs
-        to pass to the BeautifulSoup object."""
+        to pass to the BeautifulSoup object.
+
+        Args:
+            file_path: The path to the file to load.
+            open_encoding: The encoding to use when opening the file.
+            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
+            get_text_separator: The separator to use when calling get_text on the soup.
+        """
         try:
             import bs4  # noqa:F401
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "beautifulsoup4 package not found, please install it with "
                 "`pip install beautifulsoup4`"
             )
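A sketch of BSHTMLLoader with the arguments documented above; the file path is hypothetical:

from langchain.document_loaders import BSHTMLLoader

loader = BSHTMLLoader(
    "page.html",              # hypothetical path
    open_encoding="utf-8",
    get_text_separator="\n",  # separates text of adjacent tags
)
docs = loader.load()  # page text as content; the page title lands in metadata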
@@ -37,9 +44,9 @@ class BSHTMLLoader(BaseLoader):
         self.get_text_separator = get_text_separator

     def load(self) -> List[Document]:
+        """Load HTML document into document objects."""
         from bs4 import BeautifulSoup

-        """Load HTML document into document objects."""
         with open(self.file_path, "r", encoding=self.open_encoding) as f:
             soup = BeautifulSoup(f, **self.bs_kwargs)

@@ -1,4 +1,4 @@
-"""Loader that loads HuggingFace datasets."""
+"""Loads HuggingFace datasets."""
 from typing import Iterator, List, Mapping, Optional, Sequence, Union

 from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader


 class HuggingFaceDatasetLoader(BaseLoader):
-    """Loading logic for loading documents from the Hugging Face Hub."""
+    """Load Documents from the Hugging Face Hub."""

     def __init__(
         self,
@@ -27,14 +27,15 @@ class HuggingFaceDatasetLoader(BaseLoader):

         Args:
             path: Path or name of the dataset.
-            page_content_column: Page content column name.
+            page_content_column: Page content column name. Default is "text".
             name: Name of the dataset configuration.
             data_dir: Data directory of the dataset configuration.
             data_files: Path(s) to source data file(s).
             cache_dir: Directory to read/write data.
             keep_in_memory: Whether to copy the dataset in-memory.
             save_infos: Save the dataset information (checksums/size/splits/...).
-            use_auth_token: Bearer token for remote files on the Datasets Hub.
+                Default is False.
+            use_auth_token: Bearer token for remote files on the Dataset Hub.
             num_proc: Number of processes.
         """

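A sketch of HuggingFaceDatasetLoader matching the Args above; "imdb" is just an example dataset name:

from langchain.document_loaders import HuggingFaceDatasetLoader

loader = HuggingFaceDatasetLoader(path="imdb", page_content_column="text")
docs = loader.load()  # one Document per dataset row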
@@ -22,7 +22,7 @@ class IFixitLoader(BaseLoader):
     """

     def __init__(self, web_path: str):
-        """Initialize with web path."""
+        """Initialize with a web path."""
         if not web_path.startswith("https://www.ifixit.com"):
             raise ValueError("web path must start with 'https://www.ifixit.com'")

@@ -60,6 +60,16 @@ class IFixitLoader(BaseLoader):

     @staticmethod
     def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
+        """Load suggestions.
+
+        Args:
+            query: A query string
+            doc_type: The type of document to search for. Can be one of "all",
+                "device", "guide", "teardown", "answer", "wiki".
+
+        Returns:
+
+        """
         res = requests.get(
             IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
         )
@@ -89,6 +99,14 @@ class IFixitLoader(BaseLoader):
     def load_questions_and_answers(
         self, url_override: Optional[str] = None
     ) -> List[Document]:
+        """Load a list of questions and answers.
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+
+        """
         loader = WebBaseLoader(self.web_path if url_override is None else url_override)
         soup = loader.scrape()

@@ -125,6 +143,16 @@ class IFixitLoader(BaseLoader):
     def load_device(
         self, url_override: Optional[str] = None, include_guides: bool = True
     ) -> List[Document]:
+        """Loads a device
+
+        Args:
+            url_override: A URL to override the default URL.
+            include_guides: Whether to include guides linked to from the device.
+                Defaults to True.
+
+        Returns:
+
+        """
         documents = []
         if url_override is None:
             url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
@@ -153,6 +181,14 @@ class IFixitLoader(BaseLoader):
         return documents

     def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
+        """Load a guide
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+
+        """
         if url_override is None:
             url = IFIXIT_BASE_URL + "/guides/" + self.id
         else:
@@ -1,5 +1,5 @@
-"""
-Loader that loads image captions
+"""Loads image captions.
+
 By default, the loader utilizes the pre-trained BLIP image captioning model.
 https://huggingface.co/Salesforce/blip-image-captioning-base

@@ -13,7 +13,7 @@ from langchain.document_loaders.base import BaseLoader


 class ImageCaptionLoader(BaseLoader):
-    """Loader that loads the captions of an image"""
+    """Loads the captions of an image"""

     def __init__(
         self,
@@ -23,6 +23,11 @@ class ImageCaptionLoader(BaseLoader):
     ):
         """
         Initialize with a list of image paths
+
+        Args:
+            path_images: A list of image paths.
+            blip_processor: The name of the pre-trained BLIP processor.
+            blip_model: The name of the pre-trained BLIP model.
         """
         if isinstance(path_images, str):
             self.image_paths = [path_images]
@@ -1,4 +1,4 @@
-"""Loader that loads IMSDb."""
+"""Loads IMSDb."""
 from typing import List

 from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader


 class IMSDbLoader(WebBaseLoader):
-    """Loader that loads IMSDb webpages."""
+    """Loads IMSDb webpages."""

     def load(self) -> List[Document]:
         """Load webpage."""
@@ -20,6 +20,12 @@ class IuguLoader(BaseLoader):
     """Loader that fetches data from IUGU."""

     def __init__(self, resource: str, api_token: Optional[str] = None) -> None:
+        """Initialize the IUGU resource.
+
+        Args:
+            resource: The name of the resource to fetch.
+            api_token: The IUGU API token to use.
+        """
         self.resource = resource
         api_token = api_token or get_from_env("api_token", "IUGU_API_TOKEN")
         self.headers = {"Authorization": f"Bearer {api_token}"}
@@ -30,6 +30,14 @@ class JoplinLoader(BaseLoader):
         port: int = 41184,
         host: str = "localhost",
     ) -> None:
+        """
+
+        Args:
+            access_token: The access token to use.
+            port: The port where the Web Clipper service is running. Default is 41184.
+            host: The host where the Web Clipper service is running.
+                Default is localhost.
+        """
         access_token = access_token or get_from_env(
             "access_token", "JOPLIN_ACCESS_TOKEN"
         )
@@ -1,4 +1,4 @@
-"""Loader that loads data from JSON."""
+"""Loads data from JSON."""
 import json
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
@@ -8,8 +8,7 @@ from langchain.document_loaders.base import BaseLoader


 class JSONLoader(BaseLoader):
-    """Loads a JSON file and references a jq schema provided to load the text into
-    documents.
+    """Loads a JSON file using a jq schema.

     Example:
         [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
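A sketch matching the docstring's jq example above; the file path is hypothetical:

from langchain.document_loaders import JSONLoader

# For a file shaped like [{"text": ...}, {"text": ...}], the schema
# ".[].text" selects each "text" field as a Document's page_content.
loader = JSONLoader(file_path="data/example.json", jq_schema=".[].text")
docs = loader.load()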
@@ -101,7 +100,7 @@ class JSONLoader(BaseLoader):
         return str(content) if content is not None else ""

     def _validate_content_key(self, data: Any) -> None:
-        """Check if content key is valid"""
+        """Check if a content key is valid"""
         sample = data.first()
         if not isinstance(sample, dict):
             raise ValueError(
@@ -1,4 +1,4 @@
-"""Loader that loads LarkSuite (FeiShu) document json dump."""
+"""Loads LarkSuite (FeiShu) document json dump."""
 import json
 import urllib.request
 from typing import Any, Iterator, List
@@ -8,10 +8,16 @@ from langchain.document_loaders.base import BaseLoader


 class LarkSuiteDocLoader(BaseLoader):
-    """Loader that loads LarkSuite (FeiShu) document."""
+    """Loads LarkSuite (FeiShu) document."""

     def __init__(self, domain: str, access_token: str, document_id: str):
-        """Initialize with domain, access_token (tenant / user), and document_id."""
+        """Initialize with domain, access_token (tenant / user), and document_id.
+
+        Args:
+            domain: The domain to load the LarkSuite.
+            access_token: The access_token to use.
+            document_id: The document_id to load.
+        """
         self.domain = domain
         self.access_token = access_token
         self.document_id = document_id
@@ -1,4 +1,4 @@
-"""Loader that loads Markdown files."""
+"""Loads Markdown files."""
 from typing import List

 from langchain.document_loaders.unstructured import UnstructuredFileLoader
@@ -15,7 +15,7 @@ def _dependable_mastodon_import() -> mastodon:
     try:
         import mastodon
     except ImportError:
-        raise ValueError(
+        raise ImportError(
            "Mastodon.py package not found, "
            "please install it with `pip install Mastodon.py`"
        )
@@ -37,11 +37,13 @@ class MastodonTootsLoader(BaseLoader):

         Args:
             mastodon_accounts: The list of Mastodon accounts to query.
-            number_toots: How many toots to pull for each account.
+            number_toots: How many toots to pull for each account. Default is 100.
             exclude_replies: Whether to exclude reply toots from the load.
+                Default is False.
             access_token: An access token if toots are loaded as a Mastodon app. Can
                 also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
             api_base_url: A Mastodon API base URL to talk to, if not using the default.
+                Default is "https://mastodon.social".
         """
         mastodon = _dependable_mastodon_import()
         access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")
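A sketch of MastodonTootsLoader with the defaults documented above; the account handle is an example:

from langchain.document_loaders import MastodonTootsLoader

loader = MastodonTootsLoader(
    mastodon_accounts=["@Gargron@mastodon.social"],  # example account
    number_toots=50,          # documented default is 100
    exclude_replies=True,
    api_base_url="https://mastodon.social",
)
docs = loader.load()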
@@ -32,12 +32,17 @@ class MWDumpLoader(BaseLoader):
     """

     def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
-        """Initialize with file path."""
+        """Initialize with a file path.
+
+        Args:
+            file_path: XML local file path
+            encoding: Charset encoding, defaults to "utf8"
+        """
         self.file_path = file_path
         self.encoding = encoding

     def load(self) -> List[Document]:
-        """Load from file path."""
+        """Load from a file path."""
         import mwparserfromhell
         import mwxml

@@ -1,4 +1,4 @@
-"""Loader to load MHTML files, enriching metadata with page title."""
+"""Load MHTML files, enriching metadata with page title."""

 import email
 import logging
@@ -21,11 +21,18 @@ class MHTMLLoader(BaseLoader):
         get_text_separator: str = "",
     ) -> None:
         """Initialise with path, and optionally, file encoding to use, and any kwargs
-        to pass to the BeautifulSoup object."""
+        to pass to the BeautifulSoup object.
+
+        Args:
+            file_path: The path to the file to load.
+            open_encoding: The encoding to use when opening the file.
+            bs_kwargs: soup kwargs to pass to the BeautifulSoup object.
+            get_text_separator: The separator to use when getting text from the soup.
+        """
         try:
             import bs4  # noqa:F401
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "beautifulsoup4 package not found, please install it with "
                 "`pip install beautifulsoup4`"
             )