mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 00:48:45 +00:00
docstrings: document_loaders
consistency 3 (#9216)
Updated docstrings to the consistent format (probably the last update for the `document_loaders`).
This commit is contained in:
parent
a69cb95850
commit
93dd499997
@ -62,7 +62,7 @@ class AirbyteCDKLoader(BaseLoader):
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteHubspotLoader(AirbyteCDKLoader):
|
class AirbyteHubspotLoader(AirbyteCDKLoader):
|
||||||
"""Loads records from Hubspot using an Airbyte source connector."""
|
"""Load from `Hubspot` using an `Airbyte` source connector."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -94,7 +94,7 @@ class AirbyteHubspotLoader(AirbyteCDKLoader):
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteStripeLoader(AirbyteCDKLoader):
|
class AirbyteStripeLoader(AirbyteCDKLoader):
|
||||||
"""Loads records from Stripe using an Airbyte source connector."""
|
"""Load from `Stripe` using an `Airbyte` source connector."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -126,7 +126,7 @@ class AirbyteStripeLoader(AirbyteCDKLoader):
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteTypeformLoader(AirbyteCDKLoader):
|
class AirbyteTypeformLoader(AirbyteCDKLoader):
|
||||||
"""Loads records from Typeform using an Airbyte source connector."""
|
"""Load from `Typeform` using an `Airbyte` source connector."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -158,7 +158,7 @@ class AirbyteTypeformLoader(AirbyteCDKLoader):
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteZendeskSupportLoader(AirbyteCDKLoader):
|
class AirbyteZendeskSupportLoader(AirbyteCDKLoader):
|
||||||
"""Loads records from Zendesk Support using an Airbyte source connector."""
|
"""Load from `Zendesk Support` using an `Airbyte` source connector."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -190,7 +190,7 @@ class AirbyteZendeskSupportLoader(AirbyteCDKLoader):
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteShopifyLoader(AirbyteCDKLoader):
|
class AirbyteShopifyLoader(AirbyteCDKLoader):
|
||||||
"""Loads records from Shopify using an Airbyte source connector."""
|
"""Load from `Shopify` using an `Airbyte` source connector."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -222,7 +222,7 @@ class AirbyteShopifyLoader(AirbyteCDKLoader):
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteSalesforceLoader(AirbyteCDKLoader):
|
class AirbyteSalesforceLoader(AirbyteCDKLoader):
|
||||||
"""Loads records from Salesforce using an Airbyte source connector."""
|
"""Load from `Salesforce` using an `Airbyte` source connector."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -254,7 +254,7 @@ class AirbyteSalesforceLoader(AirbyteCDKLoader):
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteGongLoader(AirbyteCDKLoader):
|
class AirbyteGongLoader(AirbyteCDKLoader):
|
||||||
"""Loads records from Gong using an Airbyte source connector."""
|
"""Load from `Gong` using an `Airbyte` source connector."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -37,7 +37,7 @@ def _make_iterator(
|
|||||||
|
|
||||||
|
|
||||||
class FileSystemBlobLoader(BlobLoader):
|
class FileSystemBlobLoader(BlobLoader):
|
||||||
"""Blob loader for the local file system.
|
"""Load blobs in the local file system.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
@ -58,7 +58,7 @@ class FileSystemBlobLoader(BlobLoader):
|
|||||||
suffixes: Optional[Sequence[str]] = None,
|
suffixes: Optional[Sequence[str]] = None,
|
||||||
show_progress: bool = False,
|
show_progress: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize with path to directory and how to glob over it.
|
"""Initialize with a path to directory and how to glob over it.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
path: Path to directory to load from
|
path: Path to directory to load from
|
||||||
|
@ -19,7 +19,7 @@ PathLike = Union[str, PurePath]
|
|||||||
|
|
||||||
|
|
||||||
class Blob(BaseModel):
|
class Blob(BaseModel):
|
||||||
"""A blob is used to represent raw data by either reference or value.
|
"""Blob represents raw data by either reference or value.
|
||||||
|
|
||||||
Provides an interface to materialize the blob in different representations, and
|
Provides an interface to materialize the blob in different representations, and
|
||||||
help to decouple the development of data loaders from the downstream parsing of
|
help to decouple the development of data loaders from the downstream parsing of
|
||||||
|
@ -9,8 +9,8 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class AsyncChromiumLoader(BaseLoader):
|
class AsyncChromiumLoader(BaseLoader):
|
||||||
"""Scrape HTML content from provided URLs using a
|
"""Scrape HTML pages from URLs using a
|
||||||
headless instance of the Chromium browser."""
|
headless instance of the Chromium."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -78,7 +78,9 @@ class CSVLoader(BaseLoader):
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredCSVLoader(UnstructuredFileLoader):
|
class UnstructuredCSVLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load CSV files. Like other
|
"""Load `CSV` files using `Unstructured`.
|
||||||
|
|
||||||
|
Like other
|
||||||
Unstructured loaders, UnstructuredCSVLoader can be used in both
|
Unstructured loaders, UnstructuredCSVLoader can be used in both
|
||||||
"single" and "elements" mode. If you use the loader in "elements"
|
"single" and "elements" mode. If you use the loader in "elements"
|
||||||
mode, the CSV file will be a single Unstructured Table element.
|
mode, the CSV file will be a single Unstructured Table element.
|
||||||
|
@ -10,7 +10,7 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredEmailLoader(UnstructuredFileLoader):
|
class UnstructuredEmailLoader(UnstructuredFileLoader):
|
||||||
"""Load email files with `unstructured`.
|
"""Load email files using `Unstructured`.
|
||||||
|
|
||||||
Works with both
|
Works with both
|
||||||
.eml and .msg files. You can process attachments in addition to the
|
.eml and .msg files. You can process attachments in addition to the
|
||||||
|
@ -52,14 +52,14 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
|
|||||||
|
|
||||||
|
|
||||||
class BaseEmbaasLoader(BaseModel):
|
class BaseEmbaasLoader(BaseModel):
|
||||||
"""Base loader for embedding a model into an `Embaas` document extraction API."""
|
"""Base loader for `Embaas` document extraction API."""
|
||||||
|
|
||||||
embaas_api_key: Optional[str] = None
|
embaas_api_key: Optional[str] = None
|
||||||
"""The API key for the embaas document extraction API."""
|
"""The API key for the Embaas document extraction API."""
|
||||||
api_url: str = EMBAAS_DOC_API_URL
|
api_url: str = EMBAAS_DOC_API_URL
|
||||||
"""The URL of the embaas document extraction API."""
|
"""The URL of the Embaas document extraction API."""
|
||||||
params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
|
params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
|
||||||
"""Additional parameters to pass to the embaas document extraction API."""
|
"""Additional parameters to pass to the Embaas document extraction API."""
|
||||||
|
|
||||||
@root_validator(pre=True)
|
@root_validator(pre=True)
|
||||||
def validate_environment(cls, values: Dict) -> Dict:
|
def validate_environment(cls, values: Dict) -> Dict:
|
||||||
@ -163,13 +163,13 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
|
|||||||
except requests.exceptions.RequestException as e:
|
except requests.exceptions.RequestException as e:
|
||||||
if e.response is None or not e.response.text:
|
if e.response is None or not e.response.text:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Error raised by embaas document text extraction API: {e}"
|
f"Error raised by Embaas document text extraction API: {e}"
|
||||||
)
|
)
|
||||||
|
|
||||||
parsed_response = e.response.json()
|
parsed_response = e.response.json()
|
||||||
if "message" in parsed_response:
|
if "message" in parsed_response:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Validation Error raised by embaas document text extraction API:"
|
f"Validation Error raised by Embaas document text extraction API:"
|
||||||
f" {parsed_response['message']}"
|
f" {parsed_response['message']}"
|
||||||
)
|
)
|
||||||
raise
|
raise
|
||||||
|
@ -5,7 +5,7 @@ from typing import List, NamedTuple, Optional, cast
|
|||||||
|
|
||||||
|
|
||||||
class FileEncoding(NamedTuple):
|
class FileEncoding(NamedTuple):
|
||||||
"""A file encoding as the NamedTuple."""
|
"""File encoding as the NamedTuple."""
|
||||||
|
|
||||||
encoding: Optional[str]
|
encoding: Optional[str]
|
||||||
"""The encoding of the file."""
|
"""The encoding of the file."""
|
||||||
|
@ -56,7 +56,7 @@ def concatenate_cells(
|
|||||||
|
|
||||||
|
|
||||||
def remove_newlines(x: Any) -> Any:
|
def remove_newlines(x: Any) -> Any:
|
||||||
"""Recursively removes newlines, no matter the data structure they are stored in."""
|
"""Recursively remove newlines, no matter the data structure they are stored in."""
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
if isinstance(x, str):
|
if isinstance(x, str):
|
||||||
|
@ -10,7 +10,7 @@ from langchain.schema import Document
|
|||||||
|
|
||||||
|
|
||||||
class MimeTypeBasedParser(BaseBlobParser):
|
class MimeTypeBasedParser(BaseBlobParser):
|
||||||
"""A parser that uses mime-types to determine how to parse a blob.
|
"""Parser that uses `mime`-types to parse a blob.
|
||||||
|
|
||||||
This parser is useful for simple pipelines where the mime-type is sufficient
|
This parser is useful for simple pipelines where the mime-type is sufficient
|
||||||
to determine how to parse a blob.
|
to determine how to parse a blob.
|
||||||
|
@ -11,13 +11,13 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class ServerUnavailableException(Exception):
|
class ServerUnavailableException(Exception):
|
||||||
"""Exception raised when the GROBID server is unavailable."""
|
"""Exception raised when the Grobid server is unavailable."""
|
||||||
|
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class GrobidParser(BaseBlobParser):
|
class GrobidParser(BaseBlobParser):
|
||||||
"""Loader that uses Grobid to load article PDF files."""
|
"""Load article `PDF` files using `Grobid`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class BS4HTMLParser(BaseBlobParser):
|
class BS4HTMLParser(BaseBlobParser):
|
||||||
"""Parser that uses beautiful soup to parse HTML files."""
|
"""Pparse HTML files using `Beautiful Soup`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -3,7 +3,7 @@ from typing import List
|
|||||||
|
|
||||||
|
|
||||||
class CodeSegmenter(ABC):
|
class CodeSegmenter(ABC):
|
||||||
"""The abstract class for the code segmenter."""
|
"""Abstract class for the code segmenter."""
|
||||||
|
|
||||||
def __init__(self, code: str):
|
def __init__(self, code: str):
|
||||||
self.code = code
|
self.code = code
|
||||||
|
@ -4,7 +4,7 @@ from langchain.document_loaders.parsers.language.code_segmenter import CodeSegme
|
|||||||
|
|
||||||
|
|
||||||
class JavaScriptSegmenter(CodeSegmenter):
|
class JavaScriptSegmenter(CodeSegmenter):
|
||||||
"""The code segmenter for JavaScript."""
|
"""Code segmenter for JavaScript."""
|
||||||
|
|
||||||
def __init__(self, code: str):
|
def __init__(self, code: str):
|
||||||
super().__init__(code)
|
super().__init__(code)
|
||||||
|
@ -19,8 +19,7 @@ LANGUAGE_SEGMENTERS: Dict[str, Any] = {
|
|||||||
|
|
||||||
|
|
||||||
class LanguageParser(BaseBlobParser):
|
class LanguageParser(BaseBlobParser):
|
||||||
"""
|
"""Parse using the respective programming language syntax.
|
||||||
Language parser that split code using the respective language syntax.
|
|
||||||
|
|
||||||
Each top-level function and class in the code is loaded into separate documents.
|
Each top-level function and class in the code is loaded into separate documents.
|
||||||
Furthermore, an extra document is generated, containing the remaining top-level code
|
Furthermore, an extra document is generated, containing the remaining top-level code
|
||||||
|
@ -5,7 +5,7 @@ from langchain.document_loaders.parsers.language.code_segmenter import CodeSegme
|
|||||||
|
|
||||||
|
|
||||||
class PythonSegmenter(CodeSegmenter):
|
class PythonSegmenter(CodeSegmenter):
|
||||||
"""The code segmenter for Python."""
|
"""Code segmenter for `Python`."""
|
||||||
|
|
||||||
def __init__(self, code: str):
|
def __init__(self, code: str):
|
||||||
super().__init__(code)
|
super().__init__(code)
|
||||||
|
@ -8,7 +8,7 @@ from langchain.schema import Document
|
|||||||
|
|
||||||
|
|
||||||
class PyPDFParser(BaseBlobParser):
|
class PyPDFParser(BaseBlobParser):
|
||||||
"""Loads a PDF with pypdf and chunks at character level."""
|
"""Load `PDF` using `pypdf` and chunk at character level."""
|
||||||
|
|
||||||
def __init__(self, password: Optional[Union[str, bytes]] = None):
|
def __init__(self, password: Optional[Union[str, bytes]] = None):
|
||||||
self.password = password
|
self.password = password
|
||||||
@ -29,7 +29,7 @@ class PyPDFParser(BaseBlobParser):
|
|||||||
|
|
||||||
|
|
||||||
class PDFMinerParser(BaseBlobParser):
|
class PDFMinerParser(BaseBlobParser):
|
||||||
"""Parse PDFs with PDFMiner."""
|
"""Parse `PDF` using `PDFMiner`."""
|
||||||
|
|
||||||
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
|
||||||
"""Lazily parse the blob."""
|
"""Lazily parse the blob."""
|
||||||
@ -42,7 +42,7 @@ class PDFMinerParser(BaseBlobParser):
|
|||||||
|
|
||||||
|
|
||||||
class PyMuPDFParser(BaseBlobParser):
|
class PyMuPDFParser(BaseBlobParser):
|
||||||
"""Parse PDFs with PyMuPDF."""
|
"""Parse `PDF` using `PyMuPDF`."""
|
||||||
|
|
||||||
def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
|
def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
|
||||||
"""Initialize the parser.
|
"""Initialize the parser.
|
||||||
@ -81,7 +81,7 @@ class PyMuPDFParser(BaseBlobParser):
|
|||||||
|
|
||||||
|
|
||||||
class PyPDFium2Parser(BaseBlobParser):
|
class PyPDFium2Parser(BaseBlobParser):
|
||||||
"""Parse PDFs with PyPDFium2."""
|
"""Parse `PDF` with `PyPDFium2`."""
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
"""Initialize the parser."""
|
"""Initialize the parser."""
|
||||||
@ -114,7 +114,7 @@ class PyPDFium2Parser(BaseBlobParser):
|
|||||||
|
|
||||||
|
|
||||||
class PDFPlumberParser(BaseBlobParser):
|
class PDFPlumberParser(BaseBlobParser):
|
||||||
"""Parse PDFs with PDFPlumber."""
|
"""Parse `PDF` with `PDFPlumber`."""
|
||||||
|
|
||||||
def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
|
def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
|
||||||
"""Initialize the parser.
|
"""Initialize the parser.
|
||||||
@ -153,7 +153,7 @@ class PDFPlumberParser(BaseBlobParser):
|
|||||||
|
|
||||||
|
|
||||||
class AmazonTextractPDFParser(BaseBlobParser):
|
class AmazonTextractPDFParser(BaseBlobParser):
|
||||||
"""Sends PDF files to Amazon Textract and parses them to generate Documents.
|
"""Send `PDF` files to `Amazon Textract` and parse them.
|
||||||
|
|
||||||
For parsing multi-page PDFs, they have to reside on S3.
|
For parsing multi-page PDFs, they have to reside on S3.
|
||||||
"""
|
"""
|
||||||
|
@ -459,7 +459,7 @@ class PDFPlumberLoader(BasePDFLoader):
|
|||||||
|
|
||||||
|
|
||||||
class AmazonTextractPDFLoader(BasePDFLoader):
|
class AmazonTextractPDFLoader(BasePDFLoader):
|
||||||
""" "Load `PDF` files from a local file system, HTTP or S3.
|
"""Load `PDF` files from a local file system, HTTP or S3.
|
||||||
|
|
||||||
To authenticate, the AWS client uses the following methods to
|
To authenticate, the AWS client uses the following methods to
|
||||||
automatically load credentials:
|
automatically load credentials:
|
||||||
|
@ -47,7 +47,7 @@ class TelegramChatFileLoader(BaseLoader):
|
|||||||
|
|
||||||
|
|
||||||
def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
|
def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
|
||||||
"""Converts a string or list of strings to a list of Documents with metadata."""
|
"""Convert a string or list of strings to a list of Documents with metadata."""
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
# Take a single string as one page
|
# Take a single string as one page
|
||||||
text = [text]
|
text = [text]
|
||||||
@ -78,7 +78,7 @@ def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
|
|||||||
|
|
||||||
|
|
||||||
class TelegramChatApiLoader(BaseLoader):
|
class TelegramChatApiLoader(BaseLoader):
|
||||||
"""Loads Telegram chat json directory dump."""
|
"""Load `Telegram` chat json directory dump."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -8,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
def satisfies_min_unstructured_version(min_version: str) -> bool:
|
def satisfies_min_unstructured_version(min_version: str) -> bool:
|
||||||
"""Checks to see if the installed unstructured version exceeds the minimum version
|
"""Check if the installed `Unstructured` version exceeds the minimum version
|
||||||
for the feature in question."""
|
for the feature in question."""
|
||||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||||
|
|
||||||
@ -25,7 +25,7 @@ def satisfies_min_unstructured_version(min_version: str) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
def validate_unstructured_version(min_unstructured_version: str) -> None:
|
def validate_unstructured_version(min_unstructured_version: str) -> None:
|
||||||
"""Raises an error if the unstructured version does not exceed the
|
"""Raise an error if the `Unstructured` version does not exceed the
|
||||||
specified minimum."""
|
specified minimum."""
|
||||||
if not satisfies_min_unstructured_version(min_unstructured_version):
|
if not satisfies_min_unstructured_version(min_unstructured_version):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
@ -34,7 +34,7 @@ def validate_unstructured_version(min_unstructured_version: str) -> None:
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredBaseLoader(BaseLoader, ABC):
|
class UnstructuredBaseLoader(BaseLoader, ABC):
|
||||||
"""Loader that uses Unstructured to load files."""
|
"""Base Loader that uses `Unstructured`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -181,7 +181,7 @@ def get_elements_from_api(
|
|||||||
api_key: str = "",
|
api_key: str = "",
|
||||||
**unstructured_kwargs: Any,
|
**unstructured_kwargs: Any,
|
||||||
) -> List:
|
) -> List:
|
||||||
"""Retrieves a list of elements from the Unstructured API."""
|
"""Retrieve a list of elements from the `Unstructured API`."""
|
||||||
if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
|
if isinstance(file, collections.abc.Sequence) or isinstance(file_path, list):
|
||||||
from unstructured.partition.api import partition_multiple_via_api
|
from unstructured.partition.api import partition_multiple_via_api
|
||||||
|
|
||||||
|
@ -19,7 +19,7 @@ SCOPES = ["https://www.googleapis.com/auth/youtube.readonly"]
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GoogleApiClient:
|
class GoogleApiClient:
|
||||||
"""A Generic Google Api Client.
|
"""Generic Google API Client.
|
||||||
|
|
||||||
To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
|
To use, you should have the ``google_auth_oauthlib,youtube_transcript_api,google``
|
||||||
python package installed.
|
python package installed.
|
||||||
|
Loading…
Reference in New Issue
Block a user