mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 08:33:49 +00:00
docstrings: document_loaders
consistency 3 (#9216)
Updated docstrings into the consistent format (probably the last update for the `document_loaders`).
This commit is contained in:
parent
a69cb95850
commit
93dd499997
@ -62,7 +62,7 @@ class AirbyteCDKLoader(BaseLoader):
|
||||
|
||||
|
||||
class AirbyteHubspotLoader(AirbyteCDKLoader):
|
||||
"""Loads records from Hubspot using an Airbyte source connector."""
|
||||
"""Load from `Hubspot` using an `Airbyte` source connector."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -94,7 +94,7 @@ class AirbyteHubspotLoader(AirbyteCDKLoader):
|
||||
|
||||
|
||||
class AirbyteStripeLoader(AirbyteCDKLoader):
|
||||
"""Loads records from Stripe using an Airbyte source connector."""
|
||||
"""Load from `Stripe` using an `Airbyte` source connector."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -126,7 +126,7 @@ class AirbyteStripeLoader(AirbyteCDKLoader):
|
||||
|
||||
|
||||
class AirbyteTypeformLoader(AirbyteCDKLoader):
|
||||
"""Loads records from Typeform using an Airbyte source connector."""
|
||||
"""Load from `Typeform` using an `Airbyte` source connector."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -158,7 +158,7 @@ class AirbyteTypeformLoader(AirbyteCDKLoader):
|
||||
|
||||
|
||||
class AirbyteZendeskSupportLoader(AirbyteCDKLoader):
|
||||
"""Loads records from Zendesk Support using an Airbyte source connector."""
|
||||
"""Load from `Zendesk Support` using an `Airbyte` source connector."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -190,7 +190,7 @@ class AirbyteZendeskSupportLoader(AirbyteCDKLoader):
|
||||
|
||||
|
||||
class AirbyteShopifyLoader(AirbyteCDKLoader):
|
||||
"""Loads records from Shopify using an Airbyte source connector."""
|
||||
"""Load from `Shopify` using an `Airbyte` source connector."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -222,7 +222,7 @@ class AirbyteShopifyLoader(AirbyteCDKLoader):
|
||||
|
||||
|
||||
class AirbyteSalesforceLoader(AirbyteCDKLoader):
|
||||
"""Loads records from Salesforce using an Airbyte source connector."""
|
||||
"""Load from `Salesforce` using an `Airbyte` source connector."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -254,7 +254,7 @@ class AirbyteSalesforceLoader(AirbyteCDKLoader):
|
||||
|
||||
|
||||
class AirbyteGongLoader(AirbyteCDKLoader):
|
||||
"""Loads records from Gong using an Airbyte source connector."""
|
||||
"""Load from `Gong` using an `Airbyte` source connector."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -37,7 +37,7 @@ def _make_iterator(
|
||||
|
||||
|
||||
class FileSystemBlobLoader(BlobLoader):
|
||||
"""Blob loader for the local file system.
|
||||
"""Load blobs in the local file system.
|
||||
|
||||
Example:
|
||||
|
||||
@ -58,7 +58,7 @@ class FileSystemBlobLoader(BlobLoader):
|
||||
suffixes: Optional[Sequence[str]] = None,
|
||||
show_progress: bool = False,
|
||||
) -> None:
|
||||
"""Initialize with path to directory and how to glob over it.
|
||||
"""Initialize with a path to directory and how to glob over it.
|
||||
|
||||
Args:
|
||||
path: Path to directory to load from
|
||||
|
@ -19,7 +19,7 @@ PathLike = Union[str, PurePath]
|
||||
|
||||
|
||||
class Blob(BaseModel):
|
||||
"""A blob is used to represent raw data by either reference or value.
|
||||
"""Blob represents raw data by either reference or value.
|
||||
|
||||
Provides an interface to materialize the blob in different representations, and
|
||||
help to decouple the development of data loaders from the downstream parsing of
|
||||
|
@ -9,8 +9,8 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class AsyncChromiumLoader(BaseLoader):
|
||||
"""Scrape HTML content from provided URLs using a
|
||||
headless instance of the Chromium browser."""
|
||||
"""Scrape HTML pages from URLs using a
|
||||
headless instance of Chromium."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -78,7 +78,9 @@ class CSVLoader(BaseLoader):
|
||||
|
||||
|
||||
class UnstructuredCSVLoader(UnstructuredFileLoader):
|
||||
"""Loader that uses unstructured to load CSV files. Like other
|
||||
"""Load `CSV` files using `Unstructured`.
|
||||
|
||||
Like other
|
||||
Unstructured loaders, UnstructuredCSVLoader can be used in both
|
||||
"single" and "elements" mode. If you use the loader in "elements"
|
||||
mode, the CSV file will be a single Unstructured Table element.
|
||||
|
@ -10,7 +10,7 @@ from langchain.document_loaders.unstructured import (
|
||||
|
||||
|
||||
class UnstructuredEmailLoader(UnstructuredFileLoader):
|
||||
"""Load email files with `unstructured`.
|
||||
"""Load email files using `Unstructured`.
|
||||
|
||||
Works with both
|
||||
.eml and .msg files. You can process attachments in addition to the
|
||||
|
@ -52,14 +52,14 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
|
||||
|
||||
|
||||
class BaseEmbaasLoader(BaseModel):
|
||||
"""Base loader for embedding a model into an `Embaas` document extraction API."""
|
||||
"""Base loader for `Embaas` document extraction API."""
|
||||
|
||||
embaas_api_key: Optional[str] = None
|
||||
"""The API key for the embaas document extraction API."""
|
||||
"""The API key for the Embaas document extraction API."""
|
||||
api_url: str = EMBAAS_DOC_API_URL
|
||||
"""The URL of the embaas document extraction API."""
|
||||
"""The URL of the Embaas document extraction API."""
|
||||
params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
|
||||
"""Additional parameters to pass to the embaas document extraction API."""
|
||||
"""Additional parameters to pass to the Embaas document extraction API."""
|
||||
|
||||
@root_validator(pre=True)
|
||||
def validate_environment(cls, values: Dict) -> Dict:
|
||||
@ -163,13 +163,13 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
|
||||
except requests.exceptions.RequestException as e:
|
||||
if e.response is None or not e.response.text:
|
||||
raise ValueError(
|
||||
f"Error raised by embaas document text extraction API: {e}"
|
||||
f"Error raised by Embaas document text extraction API: {e}"
|
||||
)
|
||||
|
||||
parsed_response = e.response.json()
|
||||
if "message" in parsed_response:
|
||||
raise ValueError(
|
||||
f"Validation Error raised by embaas document text extraction API:"
|
||||
f"Validation Error raised by Embaas document text extraction API:"
|
||||
f" {parsed_response['message']}"
|
||||
)
|
||||
raise
|
||||
|
@ -5,7 +5,7 @@ from typing import List, NamedTuple, Optional, cast
|
||||
|
||||
|
||||
class FileEncoding(NamedTuple):
|
||||
"""A file encoding as the NamedTuple."""
|
||||
"""File encoding as a NamedTuple."""
|
||||
|
||||
encoding: Optional[str]
|
||||
"""The encoding of the file."""
|
||||
|
@ -56,7 +56,7 @@ def concatenate_cells(
|
||||
|
||||
|
||||
def remove_newlines(x: Any) -> Any:
|
||||
"""Recursively removes newlines, no matter the data structure they are stored in."""
|
||||
"""Recursively remove newlines, no matter the data structure they are stored in."""
|
||||
import pandas as pd
|
||||
|
||||
if isinstance(x, str):
|
||||
|
@ -10,7 +10,7 @@ from langchain.schema import Document
|
||||
|
||||
|
||||
class MimeTypeBasedParser(BaseBlobParser):
|
||||
"""A parser that uses mime-types to determine how to parse a blob.
|
||||
"""Parser that uses `mime`-types to parse a blob.
|
||||
|
||||
This parser is useful for simple pipelines where the mime-type is sufficient
|
||||
to determine how to parse a blob.
|
||||
|
@ -11,13 +11,13 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ServerUnavailableException(Exception):
|
||||
"""Exception raised when the GROBID server is unavailable."""
|
||||
"""Exception raised when the Grobid server is unavailable."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class GrobidParser(BaseBlobParser):
|
||||
"""Loader that uses Grobid to load article PDF files."""
|
||||
"""Load article `PDF` files using `Grobid`."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BS4HTMLParser(BaseBlobParser):
|
||||
"""Parser that uses beautiful soup to parse HTML files."""
|
||||
"""Parse HTML files using `Beautiful Soup`."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -3,7 +3,7 @@ from typing import List
|
||||
|
||||
|
||||
class CodeSegmenter(ABC):
|
||||
"""The abstract class for the code segmenter."""
|
||||
"""Abstract class for the code segmenter."""
|
||||
|
||||
def __init__(self, code: str):
|
||||
self.code = code
|
||||
|
@ -4,7 +4,7 @@ from langchain.document_loaders.parsers.language.code_segmenter import CodeSegme
|
||||
|
||||
|
||||
class JavaScriptSegmenter(CodeSegmenter):
|
||||
"""The code segmenter for JavaScript."""
|
||||
"""Code segmenter for JavaScript."""
|
||||
|
||||
def __init__(self, code: str):
|
||||
super().__init__(code)
|
||||
|
@ -19,8 +19,7 @@ LANGUAGE_SEGMENTERS: Dict[str, Any] = {
|
||||
|
||||
|
||||
class LanguageParser(BaseBlobParser):
|
||||
"""
|
||||
Language parser that split code using the respective language syntax.
|
||||
"""Parse using the respective programming language syntax.
|
||||
|
||||
Each top-level function and class in the code is loaded into separate documents.
|
||||
Furthermore, an extra document is generated, containing the remaining top-level code
|
||||
|
@ -5,7 +5,7 @@ from langchain.document_loaders.parsers.language.code_segmenter import CodeSegme
|
||||
|
||||
|
||||
class PythonSegmenter(CodeSegmenter):
|
||||
"""The code segmenter for Python."""
|
||||
"""Code segmenter for `Python`."""
|
||||
|
||||
def __init__(self, code: str):
|
||||
super().__init__(code)
|
||||
|
@ -8,7 +8,7 @@ from langchain.schema import Document
|
||||
|
||||
|
||||
class PyPDFParser(BaseBlobParser):
|
||||
"""Loads a PDF with pypdf and chunks at character level."""
|
||||
"""Load `PDF` using `pypdf` and chunk at character level."""
|
||||
|
||||
def __init__(self, password: Optional[Union[str, bytes]] = None):
|
||||
self.password = password
|
||||
@ -29,7 +29,7 @@ class PyPDFParser(BaseBlobParser):
|
||||
|
||||
|
||||
class PDFMinerParser(BaseBlobParser):
|
||||
"""Parse PDFs with PDFMiner."""
|
||||
"""Parse `PDF` using `PDFMiner`."""
|
||||
|
||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
||||
"""Lazily parse the blob."""
|
||||
@ -42,7 +42,7 @@ class PDFMinerParser(BaseBlobParser):
|
||||
|
||||
|
||||
class PyMuPDFParser(BaseBlobParser):
|
||||
"""Parse PDFs with PyMuPDF."""
|
||||
"""Parse `PDF` using `PyMuPDF`."""
|
||||
|
||||
def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
|
||||
"""Initialize the parser.
|
||||
@ -81,7 +81,7 @@ class PyMuPDFParser(BaseBlobParser):
|
||||
|
||||
|
||||
class PyPDFium2Parser(BaseBlobParser):
|
||||
"""Parse PDFs with PyPDFium2."""
|
||||
"""Parse `PDF` with `PyPDFium2`."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
"""Initialize the parser."""
|
||||
@ -114,7 +114,7 @@ class PyPDFium2Parser(BaseBlobParser):
|
||||
|
||||
|
||||
class PDFPlumberParser(BaseBlobParser):
|
||||
"""Parse PDFs with PDFPlumber."""
|
||||
"""Parse `PDF` with `PDFPlumber`."""
|
||||
|
||||
def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
|
||||
"""Initialize the parser.
|
||||
@ -153,7 +153,7 @@ class PDFPlumberParser(BaseBlobParser):
|
||||
|
||||
|
||||
class AmazonTextractPDFParser(BaseBlobParser):
|
||||
"""Sends PDF files to Amazon Textract and parses them to generate Documents.
|
||||
"""Send `PDF` files to `Amazon Textract` and parse them.
|
||||
|
||||
For parsing multi-page PDFs, they have to reside on S3.
|
||||
"""
|
||||
|
@ -459,7 +459,7 @@ class PDFPlumberLoader(BasePDFLoader):
|
||||
|
||||
|
||||
class AmazonTextractPDFLoader(BasePDFLoader):
|
||||
""" "Load `PDF` files from a local file system, HTTP or S3.
|
||||
"""Load `PDF` files from a local file system, HTTP or S3.
|
||||
|
||||
To authenticate, the AWS client uses the following methods to
|
||||
automatically load credentials:
|
||||
|
@ -47,7 +47,7 @@ class TelegramChatFileLoader(BaseLoader):
|
||||
|
||||
|
||||
def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
|
||||
"""Converts a string or list of strings to a list of Documents with metadata."""
|
||||
"""Convert a string or list of strings to a list of Documents with metadata."""
|
||||
if isinstance(text, str):
|
||||
# Take a single string as one page
|
||||
text = [text]
|
||||
@ -78,7 +78,7 @@ def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
|
||||
|
||||
|
||||
class TelegramChatApiLoader(BaseLoader):
|
||||
"""Loads Telegram chat json directory dump."""
|
||||
"""Load `Telegram` chat json directory dump."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
|
@ -8,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
def satisfies_min_unstructured_version(min_version: str) -> bool:
|
||||
"""Checks to see if the installed unstructured version exceeds the minimum version
|
||||
"""Check if the installed `Unstructured` version exceeds the minimum version
|
||||
for the feature in question."""
|
||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||
|
||||
@ -25,7 +25,7 @@ def satisfies_min_unstructured_version(min_version: str) -> bool:
|
||||
|
||||
|
||||
def validate_unstructured_version(min_unstructured_version: str) -> None:
|
||||
"""Raises an error if the unstructured version does not exceed the
|
||||
"""Raise an error if the `Unstructured` version does not exceed the
|
||||
specified minimum."""
|
||||
if not satisfies_min_unstructured_version(min_unstructured_version):
|
||||
raise ValueError(
|
||||
@ -34,7 +34,7 @@ def validate_unstructured_version(min_unstructured_version: str) -> None:
|
||||
|
||||
|
||||
class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||
"""Loader that uses Unstructured to load files."""
|
||||
"""Base Loader that uses `Unstructured`."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
@ -181,7 +181,7 @@ def get_elements_from_api(
|
||||
api_key: str = "",
|
||||
**unstructured_kwargs: Any,
|
||||
) -> List:
|
||||
"""Retrieves a list of elements from the Unstructured API."""
|
||||
"""Retrieve a list of elements from the `Unstructured API`."""
|
||||
if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
|
||||
from unstructured.partition.api import partition_multiple_via_api
|
||||
|
||||
|
@ -19,7 +19,7 @@ SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]
|
||||
|
||||
@dataclass
|
||||
class GoogleApiClient:
|
||||
"""A Generic Google Api Client.
|
||||
"""Generic Google API Client.
|
||||
|
||||
To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
|
||||
python packages installed.
|
||||
|
Loading…
Reference in New Issue
Block a user