docstrings: document_loaders consistency 3 (#9216)

Updated docstrings to a consistent format (probably the last update for `document_loaders`).
2025-06-26 08:33:49 +00:00 · 2023-08-14 16:28:39 -07:00 · 2023-08-14 16:28:39 -07:00 · 93dd499997
commit 93dd499997
parent a69cb95850
21 changed files with 46 additions and 45 deletions
--- a/libs/langchain/langchain/document_loaders/airbyte.py
+++ b/libs/langchain/langchain/document_loaders/airbyte.py
@ -62,7 +62,7 @@ class AirbyteCDKLoader(BaseLoader):


 class AirbyteHubspotLoader(AirbyteCDKLoader):
-    """Loads records from Hubspot using an Airbyte source connector."""
+    """Load from `Hubspot` using an `Airbyte` source connector."""

    def __init__(
        self,
@ -94,7 +94,7 @@ class AirbyteHubspotLoader(AirbyteCDKLoader):


 class AirbyteStripeLoader(AirbyteCDKLoader):
-    """Loads records from Stripe using an Airbyte source connector."""
+    """Load from `Stripe` using an `Airbyte` source connector."""

    def __init__(
        self,
@ -126,7 +126,7 @@ class AirbyteStripeLoader(AirbyteCDKLoader):


 class AirbyteTypeformLoader(AirbyteCDKLoader):
-    """Loads records from Typeform using an Airbyte source connector."""
+    """Load from `Typeform` using an `Airbyte` source connector."""

    def __init__(
        self,
@ -158,7 +158,7 @@ class AirbyteTypeformLoader(AirbyteCDKLoader):


 class AirbyteZendeskSupportLoader(AirbyteCDKLoader):
-    """Loads records from Zendesk Support using an Airbyte source connector."""
+    """Load from `Zendesk Support` using an `Airbyte` source connector."""

    def __init__(
        self,
@ -190,7 +190,7 @@ class AirbyteZendeskSupportLoader(AirbyteCDKLoader):


 class AirbyteShopifyLoader(AirbyteCDKLoader):
-    """Loads records from Shopify using an Airbyte source connector."""
+    """Load from `Shopify` using an `Airbyte` source connector."""

    def __init__(
        self,
@ -222,7 +222,7 @@ class AirbyteShopifyLoader(AirbyteCDKLoader):


 class AirbyteSalesforceLoader(AirbyteCDKLoader):
-    """Loads records from Salesforce using an Airbyte source connector."""
+    """Load from `Salesforce` using an `Airbyte` source connector."""

    def __init__(
        self,
@ -254,7 +254,7 @@ class AirbyteSalesforceLoader(AirbyteCDKLoader):


 class AirbyteGongLoader(AirbyteCDKLoader):
-    """Loads records from Gong using an Airbyte source connector."""
+    """Load from `Gong` using an `Airbyte` source connector."""

    def __init__(
        self,
--- a/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py
+++ b/libs/langchain/langchain/document_loaders/blob_loaders/file_system.py
@ -37,7 +37,7 @@ def _make_iterator(


 class FileSystemBlobLoader(BlobLoader):
-    """Blob loader for the local file system.
+    """Load blobs in the local file system.

    Example:

@ -58,7 +58,7 @@ class FileSystemBlobLoader(BlobLoader):
        suffixes: Optional[Sequence[str]] = None,
        show_progress: bool = False,
    ) -> None:
-        """Initialize with path to directory and how to glob over it.
+        """Initialize with a path to directory and how to glob over it.

        Args:
            path: Path to directory to load from
--- a/libs/langchain/langchain/document_loaders/blob_loaders/schema.py
+++ b/libs/langchain/langchain/document_loaders/blob_loaders/schema.py
@ -19,7 +19,7 @@ PathLike = Union[str, PurePath]


 class Blob(BaseModel):
-    """A blob is used to represent raw data by either reference or value.
+    """Blob represents raw data by either reference or value.

    Provides an interface to materialize the blob in different representations, and
    help to decouple the development of data loaders from the downstream parsing of
--- a/libs/langchain/langchain/document_loaders/chromium.py
+++ b/libs/langchain/langchain/document_loaders/chromium.py
@ -9,8 +9,8 @@ logger = logging.getLogger(__name__)


 class AsyncChromiumLoader(BaseLoader):
-    """Scrape HTML content from provided URLs using a
-    headless instance of the Chromium browser."""
+    """Scrape HTML pages from URLs using a
+    headless instance of Chromium."""

    def __init__(
        self,
--- a/libs/langchain/langchain/document_loaders/csv_loader.py
+++ b/libs/langchain/langchain/document_loaders/csv_loader.py
@ -78,7 +78,9 @@ class CSVLoader(BaseLoader):


 class UnstructuredCSVLoader(UnstructuredFileLoader):
-    """Loader that uses unstructured to load CSV files. Like other
+    """Load `CSV` files using `Unstructured`.
+
+    Like other
    Unstructured loaders, UnstructuredCSVLoader can be used in both
    "single" and "elements" mode. If you use the loader in "elements"
    mode, the CSV file will be a single Unstructured Table element.
--- a/libs/langchain/langchain/document_loaders/email.py
+++ b/libs/langchain/langchain/document_loaders/email.py
@ -10,7 +10,7 @@ from langchain.document_loaders.unstructured import (


 class UnstructuredEmailLoader(UnstructuredFileLoader):
-    """Load email files with `unstructured`.
+    """Load email files using `Unstructured`.

    Works with both
    .eml and .msg files. You can process attachments in addition to the
--- a/libs/langchain/langchain/document_loaders/embaas.py
+++ b/libs/langchain/langchain/document_loaders/embaas.py
@ -52,14 +52,14 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):


 class BaseEmbaasLoader(BaseModel):
-    """Base loader for embedding a model into an `Embaas` document extraction API."""
+    """Base loader for `Embaas` document extraction API."""

    embaas_api_key: Optional[str] = None
-    """The API key for the embaas document extraction API."""
+    """The API key for the Embaas document extraction API."""
    api_url: str = EMBAAS_DOC_API_URL
-    """The URL of the embaas document extraction API."""
+    """The URL of the Embaas document extraction API."""
    params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
-    """Additional parameters to pass to the embaas document extraction API."""
+    """Additional parameters to pass to the Embaas document extraction API."""

    @root_validator(pre=True)
    def validate_environment(cls, values: Dict) -> Dict:
@ -163,13 +163,13 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
        except requests.exceptions.RequestException as e:
            if e.response is None or not e.response.text:
                raise ValueError(
-                    f"Error raised by embaas document text extraction API: {e}"
+                    f"Error raised by Embaas document text extraction API: {e}"
                )

            parsed_response = e.response.json()
            if "message" in parsed_response:
                raise ValueError(
-                    f"Validation Error raised by embaas document text extraction API:"
+                    f"Validation Error raised by Embaas document text extraction API:"
                    f" {parsed_response['message']}"
                )
            raise
--- a/libs/langchain/langchain/document_loaders/helpers.py
+++ b/libs/langchain/langchain/document_loaders/helpers.py
@ -5,7 +5,7 @@ from typing import List, NamedTuple, Optional, cast


 class FileEncoding(NamedTuple):
-    """A file encoding as the NamedTuple."""
+    """File encoding as a NamedTuple."""

    encoding: Optional[str]
    """The encoding of the file."""
--- a/libs/langchain/langchain/document_loaders/notebook.py
+++ b/libs/langchain/langchain/document_loaders/notebook.py
@ -56,7 +56,7 @@ def concatenate_cells(


 def remove_newlines(x: Any) -> Any:
-    """Recursively removes newlines, no matter the data structure they are stored in."""
+    """Recursively remove newlines, no matter the data structure they are stored in."""
    import pandas as pd

    if isinstance(x, str):
--- a/libs/langchain/langchain/document_loaders/parsers/generic.py
+++ b/libs/langchain/langchain/document_loaders/parsers/generic.py
@ -10,7 +10,7 @@ from langchain.schema import Document


 class MimeTypeBasedParser(BaseBlobParser):
-    """A parser that uses mime-types to determine how to parse a blob.
+    """Parser that uses `mime`-types to parse a blob.

    This parser is useful for simple pipelines where the mime-type is sufficient
    to determine how to parse a blob.
--- a/libs/langchain/langchain/document_loaders/parsers/grobid.py
+++ b/libs/langchain/langchain/document_loaders/parsers/grobid.py
@ -11,13 +11,13 @@ logger = logging.getLogger(__name__)


 class ServerUnavailableException(Exception):
-    """Exception raised when the GROBID server is unavailable."""
+    """Exception raised when the Grobid server is unavailable."""

    pass


 class GrobidParser(BaseBlobParser):
-    """Loader that uses Grobid to load article PDF files."""
+    """Load article `PDF` files using `Grobid`."""

    def __init__(
        self,
--- a/libs/langchain/langchain/document_loaders/parsers/html/bs4.py
+++ b/libs/langchain/langchain/document_loaders/parsers/html/bs4.py
@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)


 class BS4HTMLParser(BaseBlobParser):
-    """Parser that uses beautiful soup to parse HTML files."""
+    """Parse HTML files using `Beautiful Soup`."""

    def __init__(
        self,
--- a/libs/langchain/langchain/document_loaders/parsers/language/code_segmenter.py
+++ b/libs/langchain/langchain/document_loaders/parsers/language/code_segmenter.py
@ -3,7 +3,7 @@ from typing import List


 class CodeSegmenter(ABC):
-    """The abstract class for the code segmenter."""
+    """Abstract class for the code segmenter."""

    def __init__(self, code: str):
        self.code = code
--- a/libs/langchain/langchain/document_loaders/parsers/language/javascript.py
+++ b/libs/langchain/langchain/document_loaders/parsers/language/javascript.py
@ -4,7 +4,7 @@ from langchain.document_loaders.parsers.language.code_segmenter import CodeSegme


 class JavaScriptSegmenter(CodeSegmenter):
-    """The code segmenter for JavaScript."""
+    """Code segmenter for JavaScript."""

    def __init__(self, code: str):
        super().__init__(code)
--- a/libs/langchain/langchain/document_loaders/parsers/language/language_parser.py
+++ b/libs/langchain/langchain/document_loaders/parsers/language/language_parser.py
@ -19,8 +19,7 @@ LANGUAGE_SEGMENTERS: Dict[str, Any] = {


 class LanguageParser(BaseBlobParser):
-    """
-    Language parser that split code using the respective language syntax.
+    """Parse using the respective programming language syntax.

    Each top-level function and class in the code is loaded into separate documents.
    Furthermore, an extra document is generated, containing the remaining top-level code
--- a/libs/langchain/langchain/document_loaders/parsers/language/python.py
+++ b/libs/langchain/langchain/document_loaders/parsers/language/python.py
@ -5,7 +5,7 @@ from langchain.document_loaders.parsers.language.code_segmenter import CodeSegme


 class PythonSegmenter(CodeSegmenter):
-    """The code segmenter for Python."""
+    """Code segmenter for `Python`."""

    def __init__(self, code: str):
        super().__init__(code)
--- a/libs/langchain/langchain/document_loaders/parsers/pdf.py
+++ b/libs/langchain/langchain/document_loaders/parsers/pdf.py
@ -8,7 +8,7 @@ from langchain.schema import Document


 class PyPDFParser(BaseBlobParser):
-    """Loads a PDF with pypdf and chunks at character level."""
+    """Load `PDF` using `pypdf` and chunk at character level."""

    def __init__(self, password: Optional[Union[str, bytes]] = None):
        self.password = password
@ -29,7 +29,7 @@ class PyPDFParser(BaseBlobParser):


 class PDFMinerParser(BaseBlobParser):
-    """Parse PDFs with PDFMiner."""
+    """Parse `PDF` using `PDFMiner`."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily parse the blob."""
@ -42,7 +42,7 @@ class PDFMinerParser(BaseBlobParser):


 class PyMuPDFParser(BaseBlobParser):
-    """Parse PDFs with PyMuPDF."""
+    """Parse `PDF` using `PyMuPDF`."""

    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
        """Initialize the parser.
@ -81,7 +81,7 @@ class PyMuPDFParser(BaseBlobParser):


 class PyPDFium2Parser(BaseBlobParser):
-    """Parse PDFs with PyPDFium2."""
+    """Parse `PDF` using `PyPDFium2`."""

    def __init__(self) -> None:
        """Initialize the parser."""
@ -114,7 +114,7 @@ class PyPDFium2Parser(BaseBlobParser):


 class PDFPlumberParser(BaseBlobParser):
-    """Parse PDFs with PDFPlumber."""
+    """Parse `PDF` using `PDFPlumber`."""

    def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
        """Initialize the parser.
@ -153,7 +153,7 @@ class PDFPlumberParser(BaseBlobParser):


 class AmazonTextractPDFParser(BaseBlobParser):
-    """Sends PDF files to Amazon Textract and parses them to generate Documents.
+    """Send `PDF` files to `Amazon Textract` and parse them.

    For parsing multi-page PDFs, they have to reside on S3.
    """
--- a/libs/langchain/langchain/document_loaders/pdf.py
+++ b/libs/langchain/langchain/document_loaders/pdf.py
@ -459,7 +459,7 @@ class PDFPlumberLoader(BasePDFLoader):


 class AmazonTextractPDFLoader(BasePDFLoader):
-    """ "Load `PDF` files from a local file system, HTTP or S3.
+    """Load `PDF` files from a local file system, HTTP or S3.

    To authenticate, the AWS client uses the following methods to
    automatically load credentials:
--- a/libs/langchain/langchain/document_loaders/telegram.py
+++ b/libs/langchain/langchain/document_loaders/telegram.py
@ -47,7 +47,7 @@ class TelegramChatFileLoader(BaseLoader):


 def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
-    """Converts a string or list of strings to a list of Documents with metadata."""
+    """Convert a string or list of strings to a list of Documents with metadata."""
    if isinstance(text, str):
        # Take a single string as one page
        text = [text]
@ -78,7 +78,7 @@ def text_to_docs(text: Union[str, List[str]]) -> List[Document]:


 class TelegramChatApiLoader(BaseLoader):
-    """Loads Telegram chat json directory dump."""
+    """Load `Telegram` chat json directory dump."""

    def __init__(
        self,
--- a/libs/langchain/langchain/document_loaders/unstructured.py
+++ b/libs/langchain/langchain/document_loaders/unstructured.py
@ -8,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader


 def satisfies_min_unstructured_version(min_version: str) -> bool:
-    """Checks to see if the installed unstructured version exceeds the minimum version
+    """Check if the installed `Unstructured` version exceeds the minimum version
    for the feature in question."""
    from unstructured.__version__ import __version__ as __unstructured_version__

@ -25,7 +25,7 @@ def satisfies_min_unstructured_version(min_version: str) -> bool:


 def validate_unstructured_version(min_unstructured_version: str) -> None:
-    """Raises an error if the unstructured version does not exceed the
+    """Raise an error if the `Unstructured` version does not exceed the
    specified minimum."""
    if not satisfies_min_unstructured_version(min_unstructured_version):
        raise ValueError(
@ -34,7 +34,7 @@ def validate_unstructured_version(min_unstructured_version: str) -> None:


 class UnstructuredBaseLoader(BaseLoader, ABC):
-    """Loader that uses Unstructured to load files."""
+    """Base loader that uses `Unstructured`."""

    def __init__(
        self,
@ -181,7 +181,7 @@ def get_elements_from_api(
    api_key: str = "",
    **unstructured_kwargs: Any,
 ) -> List:
-    """Retrieves a list of elements from the Unstructured API."""
+    """Retrieve a list of elements from the `Unstructured API`."""
    if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
        from unstructured.partition.api import partition_multiple_via_api

--- a/libs/langchain/langchain/document_loaders/youtube.py
+++ b/libs/langchain/langchain/document_loaders/youtube.py
@ -19,7 +19,7 @@ SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]

@dataclass
 class GoogleApiClient:
-    """A Generic Google Api Client.
+    """Generic Google API Client.

    To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
    python package installed.