docstrings document_loaders 2 (#6890)

Updated docstrings for the `document_loaders`.

Maintainer responsibilities:
- DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev

parent 77ae8084a0
commit 1feac83323
@@ -9,7 +9,7 @@ class CollegeConfidentialLoader(WebBaseLoader):
     """Loader that loads College Confidential webpages."""

     def load(self) -> List[Document]:
-        """Load webpage."""
+        """Load webpages as Documents."""
         soup = self.scrape()
         text = soup.select_one("main[class='skin-handler']").text
         metadata = {"source": self.web_path}
@@ -33,8 +33,9 @@ class ContentFormat(str, Enum):


 class ConfluenceLoader(BaseLoader):
-    """
-    Load Confluence pages. Port of https://llamahub.ai/l/confluence
+    """Load Confluence pages.
+
+    Port of https://llamahub.ai/l/confluence
     This currently supports username/api_key, Oauth2 login or personal access token
     authentication.

@@ -175,7 +176,7 @@ class ConfluenceLoader(BaseLoader):
                 "key_cert",
             ]:
                 errors.append(
-                    "You have either ommited require keys or added extra "
+                    "You have either omitted require keys or added extra "
                     "keys to the oauth2 dictionary. key values should be "
                     "`['access_token', 'access_token_secret', 'consumer_key', 'key_cert']`"
                 )
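For reference, a minimal usage sketch of the oauth2-style initialization implied by the check above; the site URL, space key, and credential values are hypothetical placeholders:

from langchain.document_loaders import ConfluenceLoader

# The oauth2 dict must carry exactly the four keys validated above.
loader = ConfluenceLoader(
    url="https://example.atlassian.net/wiki",  # hypothetical site URL
    oauth2={
        "access_token": "...",
        "access_token_secret": "...",
        "consumer_key": "...",
        "key_cert": "...",
    },
)
docs = loader.load(space_key="SPACE", limit=50)  # hypothetical space key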
@@ -343,7 +344,7 @@ class ConfluenceLoader(BaseLoader):
         doesn't match the limit value. If `limit` is >100 confluence
         seems to cap the response to 100. Also, due to the Atlassian Python
         package, we don't get the "next" values from the "_links" key because
-        they only return the value from the results key. So here, the pagination
+        they only return the value from the result key. So here, the pagination
         starts from 0 and goes until the max_pages, getting the `limit` number
         of pages with each request. We have to manually check if there
         are more docs based on the length of the returned list of pages, rather than
@@ -10,11 +10,11 @@ class CoNLLULoader(BaseLoader):
     """Load CoNLL-U files."""

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         self.file_path = file_path

     def load(self) -> List[Document]:
-        """Load from file path."""
+        """Load from a file path."""
         with open(self.file_path, encoding="utf8") as f:
             tsv = list(csv.reader(f, delimiter="\t"))
@@ -37,6 +37,16 @@ class CSVLoader(BaseLoader):
         csv_args: Optional[Dict] = None,
         encoding: Optional[str] = None,
     ):
+        """
+
+        Args:
+            file_path: The path to the CSV file.
+            source_column: The name of the column in the CSV file to use as the source.
+                Optional. Defaults to None.
+            csv_args: A dictionary of arguments to pass to the csv.DictReader.
+                Optional. Defaults to None.
+            encoding: The encoding of the CSV file. Optional. Defaults to None.
+        """
         self.file_path = file_path
         self.source_column = source_column
         self.encoding = encoding
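A minimal usage sketch of the CSVLoader arguments documented above; the file path and column name are hypothetical:

from langchain.document_loaders.csv_loader import CSVLoader

# One Document is produced per CSV row; source_column overrides the
# file path as each row's metadata "source".
loader = CSVLoader(
    file_path="data/example.csv",  # hypothetical path
    source_column="url",           # hypothetical column name
    csv_args={"delimiter": ";"},   # passed through to csv.DictReader
    encoding="utf-8",
)
docs = loader.load()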
@@ -73,6 +83,14 @@ class UnstructuredCSVLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+
+        Args:
+            file_path: The path to the CSV file.
+            mode: The mode to use when loading the CSV file.
+                Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.6.8")
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

@@ -1,4 +1,4 @@
-"""Load from Dataframe object"""
+"""Load from a Dataframe object"""
 from typing import Any, Iterator, List

 from langchain.docstore.document import Document
@@ -6,10 +6,16 @@ from langchain.document_loaders.base import BaseLoader


 class DataFrameLoader(BaseLoader):
-    """Load Pandas DataFrames."""
+    """Load Pandas DataFrame."""

     def __init__(self, data_frame: Any, page_content_column: str = "text"):
-        """Initialize with dataframe object."""
+        """Initialize with dataframe object.
+
+        Args:
+            data_frame: Pandas DataFrame object.
+            page_content_column: Name of the column containing the page content.
+                Defaults to "text".
+        """
         import pandas as pd

         if not isinstance(data_frame, pd.DataFrame):
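A short sketch of DataFrameLoader as documented above; the frame contents are made up:

import pandas as pd
from langchain.document_loaders import DataFrameLoader

df = pd.DataFrame({"text": ["hello", "world"], "topic": ["greetings", "planets"]})
loader = DataFrameLoader(df, page_content_column="text")
docs = loader.load()  # remaining columns (here "topic") become Document metadata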
@@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)


 class DiffbotLoader(BaseLoader):
-    """Loader that loads Diffbot file json."""
+    """Loads Diffbot file json."""

     def __init__(
         self, api_token: str, urls: List[str], continue_on_failure: bool = True
     ):
-        """Initialize with API token, ids, and key."""
+        """Initialize with API token, ids, and key.
+
+        Args:
+            api_token: Diffbot API token.
+            urls: List of URLs to load.
+            continue_on_failure: Whether to continue loading other URLs if one fails.
+                Defaults to True.
+        """
         self.api_token = api_token
         self.urls = urls
         self.continue_on_failure = continue_on_failure
@@ -38,7 +45,7 @@ class DiffbotLoader(BaseLoader):
         return response.json() if response.ok else {}

     def load(self) -> List[Document]:
-        """Extract text from Diffbot on all the URLs and return Document instances"""
+        """Extract text from Diffbot on all the URLs and return Documents"""
         docs: List[Document] = list()

         for url in self.urls:
@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from a directory."""
+"""Load documents from a directory."""
 import concurrent
 import logging
 from pathlib import Path
@@ -25,7 +25,7 @@ def _is_visible(p: Path) -> bool:


 class DirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from a directory."""
+    """Load documents from a directory."""

     def __init__(
         self,
@@ -40,7 +40,22 @@ class DirectoryLoader(BaseLoader):
         use_multithreading: bool = False,
         max_concurrency: int = 4,
     ):
-        """Initialize with path to directory and how to glob over it."""
+        """Initialize with a path to directory and how to glob over it.
+
+        Args:
+            path: Path to directory.
+            glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
+                (all files except hidden).
+            silent_errors: Whether to silently ignore errors. Defaults to False.
+            load_hidden: Whether to load hidden files. Defaults to False.
+            loader_cls: Loader class to use for loading files.
+                Defaults to UnstructuredFileLoader.
+            loader_kwargs: Keyword arguments to pass to loader_cls. Defaults to None.
+            recursive: Whether to recursively search for files. Defaults to False.
+            show_progress: Whether to show a progress bar. Defaults to False.
+            use_multithreading: Whether to use multithreading. Defaults to False.
+            max_concurrency: The maximum number of threads to use. Defaults to 4.
+        """
         if loader_kwargs is None:
             loader_kwargs = {}
         self.path = path
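A usage sketch built from the arguments above; the directory path and glob are hypothetical:

from langchain.document_loaders import DirectoryLoader, TextLoader

loader = DirectoryLoader(
    "docs/",                 # hypothetical directory
    glob="**/*.md",
    loader_cls=TextLoader,   # instead of the default UnstructuredFileLoader
    recursive=True,
    show_progress=True,
    use_multithreading=True,
    max_concurrency=4,
)
docs = loader.load()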
@@ -57,6 +72,14 @@ class DirectoryLoader(BaseLoader):
     def load_file(
         self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any]
     ) -> None:
+        """Load a file.
+
+        Args:
+            item: File path.
+            path: Directory path.
+            docs: List of documents to append to.
+            pbar: Progress bar. Defaults to None.
+        """
         if item.is_file():
             if _is_visible(item.relative_to(path)) or self.load_hidden:
                 try:
@@ -14,7 +14,12 @@ class DiscordChatLoader(BaseLoader):
     """Load Discord chat logs."""

     def __init__(self, chat_log: pd.DataFrame, user_id_col: str = "ID"):
-        """Initialize with a Pandas DataFrame containing chat logs."""
+        """Initialize with a Pandas DataFrame containing chat logs.
+
+        Args:
+            chat_log: Pandas DataFrame containing chat logs.
+            user_id_col: Name of the column containing the user ID. Defaults to "ID".
+        """
         if not isinstance(chat_log, pd.DataFrame):
             raise ValueError(
                 f"Expected chat_log to be a pd.DataFrame, got {type(chat_log)}"
@@ -1,4 +1,4 @@
-"""Loader that loads processed documents from Docugami."""
+"""Loads processed documents from Docugami."""

 import io
 import logging
@@ -29,22 +29,35 @@ logger = logging.getLogger(__name__)


 class DocugamiLoader(BaseLoader, BaseModel):
-    """Loader that loads processed docs from Docugami.
+    """Loads processed docs from Docugami.

     To use, you should have the ``lxml`` python package installed.
     """

     api: str = DEFAULT_API_ENDPOINT
+    """The Docugami API endpoint to use."""

     access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY")
+    """The Docugami API access token to use."""
     docset_id: Optional[str]
+    """The Docugami API docset ID to use."""
     document_ids: Optional[Sequence[str]]
+    """The Docugami API document IDs to use."""
     file_paths: Optional[Sequence[Union[Path, str]]]
+    """The local file paths to use."""
     min_chunk_size: int = 32  # appended to the next chunk to avoid over-chunking
+    """The minimum chunk size to use when parsing DGML. Defaults to 32."""

     @root_validator
     def validate_local_or_remote(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        """Validate that either local file paths are given, or remote API docset ID."""
+        """Validate that either local file paths are given, or remote API docset ID.
+
+        Args:
+            values: The values to validate.
+
+        Returns:
+            The validated values.
+        """
         if values.get("file_paths") and values.get("docset_id"):
             raise ValueError("Cannot specify both file_paths and remote API docset_id")
@@ -22,6 +22,20 @@ class DuckDBLoader(BaseLoader):
         page_content_columns: Optional[List[str]] = None,
         metadata_columns: Optional[List[str]] = None,
     ):
+        """
+
+        Args:
+            query: The query to execute.
+            database: The database to connect to. Defaults to ":memory:".
+            read_only: Whether to open the database in read-only mode.
+                Defaults to False.
+            config: A dictionary of configuration options to pass to the database.
+                Optional.
+            page_content_columns: The columns to write into the `page_content`
+                of the document. Optional.
+            metadata_columns: The columns to write into the `metadata` of the document.
+                Optional.
+        """
         self.query = query
         self.database = database
         self.read_only = read_only
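A sketch of DuckDBLoader with the arguments above; the query and table name are hypothetical:

from langchain.document_loaders import DuckDBLoader

loader = DuckDBLoader(
    query="SELECT title, body FROM posts",  # hypothetical table
    database=":memory:",
    page_content_columns=["body"],   # written into page_content
    metadata_columns=["title"],      # written into metadata
)
docs = loader.load()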
@@ -1,4 +1,4 @@
-"""Loader that loads email files."""
+"""Loads email files."""
 import os
 from typing import Any, List

@@ -72,12 +72,17 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):

 class OutlookMessageLoader(BaseLoader):
     """
-    Loader that loads Outlook Message files using extract_msg.
+    Loads Outlook Message files using extract_msg.

     https://github.com/TeamMsgExtractor/msg-extractor
     """

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path.
+
+        Args:
+            file_path: The path to the Outlook Message file.
+        """

         self.file_path = file_path

@@ -52,7 +52,10 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):


 class BaseEmbaasLoader(BaseModel):
+    """Base class for embedding a model into an Embaas document extraction API."""

     embaas_api_key: Optional[str] = None
+    """The API key for the embaas document extraction API."""
     api_url: str = EMBAAS_DOC_API_URL
+    """The URL of the embaas document extraction API."""
     params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
@@ -69,7 +72,7 @@ class BaseEmbaasLoader(BaseModel):


 class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
-    """Wrapper around embaas's document byte loader service.
+    """Embaas's document byte loader.

     To use, you should have the
     environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
@@ -99,6 +102,11 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
     """

     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Parses the blob lazily.
+
+        Args:
+            blob: The blob to parse.
+        """
         yield from self._get_documents(blob=blob)

     @staticmethod
@@ -170,7 +178,7 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):


 class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
-    """Wrapper around embaas's document loader service.
+    """Embaas's document loader.

     To use, you should have the
     environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
@@ -14,6 +14,7 @@ from langchain.document_loaders.base import BaseLoader

 class EverNoteLoader(BaseLoader):
     """EverNote Loader.
+
     Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
     Instructions on producing this file can be found at
     https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML
@@ -13,6 +13,14 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+
+        Args:
+            file_path: The path to the Microsoft Excel file.
+            mode: The mode to use when partitioning the file. See unstructured docs
+                for more info. Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.6.7")
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)

@@ -9,7 +9,11 @@ from langchain.document_loaders.base import BaseLoader


 def concatenate_rows(row: dict) -> str:
-    """Combine message information in a readable format ready to be used."""
+    """Combine message information in a readable format ready to be used.
+
+    Args:
+        row: dictionary containing message information.
+    """
     sender = row["sender_name"]
     text = row["content"]
     date = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000).strftime(
@@ -19,10 +23,10 @@ def concatenate_rows(row: dict) -> str:


 class FacebookChatLoader(BaseLoader):
-    """Loader that loads Facebook messages json directory dump."""
+    """Loads Facebook messages json directory dump."""

     def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a path."""
         self.file_path = path

     def load(self) -> List[Document]:
@@ -9,10 +9,16 @@ from langchain.utils import stringify_dict


 class FigmaFileLoader(BaseLoader):
-    """Loader that loads Figma file json."""
+    """Loads Figma file json."""

     def __init__(self, access_token: str, ids: str, key: str):
-        """Initialize with access token, ids, and key."""
+        """Initialize with access token, ids, and key.
+
+        Args:
+            access_token: The access token for the Figma REST API.
+            ids: The ids of the Figma file.
+            key: The key for the Figma file
+        """
         self.access_token = access_token
         self.ids = ids
         self.key = key
@@ -7,10 +7,16 @@ from langchain.document_loaders.gcs_file import GCSFileLoader


 class GCSDirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from GCS."""
+    """Loads Documents from GCS."""

     def __init__(self, project_name: str, bucket: str, prefix: str = ""):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            project_name: The name of the project for the GCS bucket.
+            bucket: The name of the GCS bucket.
+            prefix: The prefix of the GCS bucket.
+        """
         self.project_name = project_name
         self.bucket = bucket
         self.prefix = prefix
@@ -20,7 +26,7 @@ class GCSDirectoryLoader(BaseLoader):
         try:
             from google.cloud import storage
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "Could not import google-cloud-storage python package. "
                 "Please install it with `pip install google-cloud-storage`."
             )
@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from a GCS file."""
+"""Load documents from a GCS file."""
 import os
 import tempfile
 from typing import List
@@ -9,10 +9,16 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader


 class GCSFileLoader(BaseLoader):
-    """Loading logic for loading documents from GCS."""
+    """Load Documents from a GCS file."""

     def __init__(self, project_name: str, bucket: str, blob: str):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            project_name: The name of the project to load
+            bucket: The name of the GCS bucket.
+            blob: The name of the GCS blob to load.
+        """
         self.bucket = bucket
         self.blob = blob
         self.project_name = project_name
@@ -22,7 +28,7 @@ class GCSFileLoader(BaseLoader):
         try:
             from google.cloud import storage
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "Could not import google-cloud-storage python package. "
                 "Please install it with `pip install google-cloud-storage`."
             )
@@ -7,9 +7,9 @@ from langchain.document_loaders.base import BaseLoader

 class GitLoader(BaseLoader):
     """Loads files from a Git repository into a list of documents.
-    Repository can be local on disk available at `repo_path`,
+    The Repository can be local on disk available at `repo_path`,
     or remote at `clone_url` that will be cloned to `repo_path`.
-    Currently supports only text files.
+    Currently, supports only text files.

     Each document represents one file in the repository. The `path` points to
     the local Git repository, and the `branch` specifies the branch to load
@@ -23,6 +23,15 @@ class GitLoader(BaseLoader):
         branch: Optional[str] = "main",
         file_filter: Optional[Callable[[str], bool]] = None,
     ):
+        """
+
+        Args:
+            repo_path: The path to the Git repository.
+            clone_url: Optional. The URL to clone the repository from.
+            branch: Optional. The branch to load files from. Defaults to `main`.
+            file_filter: Optional. A function that takes a file path and returns
+                a boolean indicating whether to load the file. Defaults to None.
+        """
         self.repo_path = repo_path
         self.clone_url = clone_url
         self.branch = branch
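A sketch of GitLoader using the arguments above; the local path is hypothetical:

from langchain.document_loaders import GitLoader

# Clones clone_url into repo_path if the repository is not already there;
# file_filter keeps only Python files.
loader = GitLoader(
    repo_path="./example_repo",  # hypothetical local path
    clone_url="https://github.com/hwchase17/langchain",
    branch="master",
    file_filter=lambda file_path: file_path.endswith(".py"),
)
docs = loader.load()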
@@ -28,7 +28,9 @@ class GitbookLoader(WebBaseLoader):
             load_all_paths: If set to True, all relative paths in the navbar
                 are loaded instead of only `web_page`.
             base_url: If `load_all_paths` is True, the relative paths are
-                appended to this base url. Defaults to `web_page` if not set.
+                appended to this base url. Defaults to `web_page`.
+            content_selector: The CSS selector for the content to load.
+                Defaults to "main".
         """
         self.base_url = base_url or web_page
         if self.base_url.endswith("/"):
@@ -35,6 +35,8 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC):


 class GitHubIssuesLoader(BaseGitHubLoader):
+    """Load issues of a GitHub repository."""
+
     include_prs: bool = True
     """If True include Pull Requests in results, otherwise ignore them."""
     milestone: Union[int, Literal["*", "none"], None] = None
@@ -159,6 +161,7 @@ class GitHubIssuesLoader(BaseGitHubLoader):

     @property
     def query_params(self) -> str:
+        """Create query parameters for GitHub API."""
         labels = ",".join(self.labels) if self.labels else self.labels
         query_params_dict = {
             "milestone": self.milestone,
@@ -179,4 +182,5 @@ class GitHubIssuesLoader(BaseGitHubLoader):

     @property
     def url(self) -> str:
+        """Create URL for GitHub API."""
         return f"https://api.github.com/repos/{self.repo}/issues?{self.query_params}"
@@ -22,21 +22,32 @@ SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]


 class GoogleDriveLoader(BaseLoader, BaseModel):
-    """Loader that loads Google Docs from Google Drive."""
+    """Loads Google Docs from Google Drive."""

     service_account_key: Path = Path.home() / ".credentials" / "keys.json"
+    """Path to the service account key file."""
     credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
+    """Path to the credentials file."""
     token_path: Path = Path.home() / ".credentials" / "token.json"
+    """Path to the token file."""
     folder_id: Optional[str] = None
+    """The folder id to load from."""
     document_ids: Optional[List[str]] = None
+    """The document ids to load from."""
     file_ids: Optional[List[str]] = None
+    """The file ids to load from."""
     recursive: bool = False
+    """Whether to load recursively. Only applies when folder_id is given."""
     file_types: Optional[Sequence[str]] = None
+    """The file types to load. Only applies when folder_id is given."""
     load_trashed_files: bool = False
+    """Whether to load trashed files. Only applies when folder_id is given."""
     # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
     # results in pydantic validation errors
     file_loader_cls: Any = None
+    """The file loader class to use."""
     file_loader_kwargs: Dict["str", Any] = {}
+    """The file loader kwargs to use."""

     @root_validator
     def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
@@ -1,4 +1,4 @@
-"""Loader that loads .txt web files."""
+"""Loads .txt web files."""
 from typing import List

 from langchain.docstore.document import Document
@@ -9,7 +9,7 @@ class GutenbergLoader(BaseLoader):
     """Loader that uses urllib to load .txt web files."""

     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         if not file_path.startswith("https://www.gutenberg.org"):
             raise ValueError("file path must start with 'https://www.gutenberg.org'")

@@ -5,9 +5,14 @@ from typing import List, NamedTuple, Optional, cast


 class FileEncoding(NamedTuple):
+    """A file encoding as the NamedTuple."""
+
     encoding: Optional[str]
+    """The encoding of the file."""
     confidence: float
+    """The confidence of the encoding."""
     language: Optional[str]
+    """The language of the file."""


 def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
@@ -15,6 +20,10 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding

     Returns a list of `FileEncoding` tuples with the detected encodings ordered
     by confidence.
+
+    Args:
+        file_path: The path to the file to detect the encoding for.
+        timeout: The timeout in seconds for the encoding detection.
     """
     import chardet

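A sketch of calling detect_file_encodings; the module path and file name are assumptions based on this hunk:

from langchain.document_loaders.helpers import detect_file_encodings

encodings = detect_file_encodings("data/unknown.txt", timeout=5)  # hypothetical file
best = encodings[0]  # results are ordered by confidence
print(best.encoding, best.confidence, best.language)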
@@ -1,4 +1,4 @@
-"""Loader that loads HN."""
+"""Loader that loads Hacker News."""
 from typing import Any, List

 from langchain.docstore.document import Document
@@ -11,7 +11,7 @@ class HNLoader(WebBaseLoader):
     def load(self) -> List[Document]:
         """Get important HN webpage information.

-        Components are:
+        HN webpage components are:
             - title
             - content
             - source url,
@@ -20,11 +20,18 @@ class BSHTMLLoader(BaseLoader):
         get_text_separator: str = "",
     ) -> None:
         """Initialise with path, and optionally, file encoding to use, and any kwargs
-        to pass to the BeautifulSoup object."""
+        to pass to the BeautifulSoup object.
+
+        Args:
+            file_path: The path to the file to load.
+            open_encoding: The encoding to use when opening the file.
+            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
+            get_text_separator: The separator to use when calling get_text on the soup.
+        """
         try:
             import bs4  # noqa:F401
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "beautifulsoup4 package not found, please install it with "
                 "`pip install beautifulsoup4`"
             )
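A sketch of BSHTMLLoader with the arguments documented above; the file path is hypothetical:

from langchain.document_loaders import BSHTMLLoader

loader = BSHTMLLoader(
    "page.html",              # hypothetical path
    open_encoding="utf-8",
    get_text_separator="\n",  # separates text of adjacent tags
)
docs = loader.load()  # page text as content; the page title lands in metadata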
@@ -37,9 +44,9 @@ class BSHTMLLoader(BaseLoader):
         self.get_text_separator = get_text_separator

     def load(self) -> List[Document]:
+        """Load HTML document into document objects."""
         from bs4 import BeautifulSoup

-        """Load HTML document into document objects."""
         with open(self.file_path, "r", encoding=self.open_encoding) as f:
             soup = BeautifulSoup(f, **self.bs_kwargs)

@@ -1,4 +1,4 @@
-"""Loader that loads HuggingFace datasets."""
+"""Loads HuggingFace datasets."""
 from typing import Iterator, List, Mapping, Optional, Sequence, Union

 from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader


 class HuggingFaceDatasetLoader(BaseLoader):
-    """Loading logic for loading documents from the Hugging Face Hub."""
+    """Load Documents from the Hugging Face Hub."""

     def __init__(
         self,
@@ -27,14 +27,15 @@ class HuggingFaceDatasetLoader(BaseLoader):

         Args:
             path: Path or name of the dataset.
-            page_content_column: Page content column name.
+            page_content_column: Page content column name. Default is "text".
             name: Name of the dataset configuration.
             data_dir: Data directory of the dataset configuration.
             data_files: Path(s) to source data file(s).
             cache_dir: Directory to read/write data.
             keep_in_memory: Whether to copy the dataset in-memory.
             save_infos: Save the dataset information (checksums/size/splits/...).
-            use_auth_token: Bearer token for remote files on the Datasets Hub.
+                Default is False.
+            use_auth_token: Bearer token for remote files on the Dataset Hub.
             num_proc: Number of processes.
         """

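A sketch of HuggingFaceDatasetLoader matching the Args above; "imdb" is just an example dataset name:

from langchain.document_loaders import HuggingFaceDatasetLoader

loader = HuggingFaceDatasetLoader(path="imdb", page_content_column="text")
docs = loader.load()  # one Document per dataset row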
@@ -22,7 +22,7 @@ class IFixitLoader(BaseLoader):
     """

     def __init__(self, web_path: str):
-        """Initialize with web path."""
+        """Initialize with a web path."""
         if not web_path.startswith("https://www.ifixit.com"):
             raise ValueError("web path must start with 'https://www.ifixit.com'")

@@ -60,6 +60,16 @@ class IFixitLoader(BaseLoader):

     @staticmethod
     def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
+        """Load suggestions.
+
+        Args:
+            query: A query string
+            doc_type: The type of document to search for. Can be one of "all",
+                "device", "guide", "teardown", "answer", "wiki".
+
+        Returns:
+
+        """
         res = requests.get(
             IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
         )
@@ -89,6 +99,14 @@ class IFixitLoader(BaseLoader):
     def load_questions_and_answers(
         self, url_override: Optional[str] = None
     ) -> List[Document]:
+        """Load a list of questions and answers.
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+
+        """
         loader = WebBaseLoader(self.web_path if url_override is None else url_override)
         soup = loader.scrape()

@@ -125,6 +143,16 @@ class IFixitLoader(BaseLoader):
     def load_device(
         self, url_override: Optional[str] = None, include_guides: bool = True
     ) -> List[Document]:
+        """Loads a device
+
+        Args:
+            url_override: A URL to override the default URL.
+            include_guides: Whether to include guides linked to from the device.
+                Defaults to True.
+
+        Returns:
+
+        """
         documents = []
         if url_override is None:
             url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
@@ -153,6 +181,14 @@ class IFixitLoader(BaseLoader):
         return documents

     def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
+        """Load a guide
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+
+        """
         if url_override is None:
             url = IFIXIT_BASE_URL + "/guides/" + self.id
         else:
@@ -1,5 +1,5 @@
-"""
-Loader that loads image captions
+"""Loads image captions.
+
 By default, the loader utilizes the pre-trained BLIP image captioning model.
 https://huggingface.co/Salesforce/blip-image-captioning-base

@@ -13,7 +13,7 @@ from langchain.document_loaders.base import BaseLoader


 class ImageCaptionLoader(BaseLoader):
-    """Loader that loads the captions of an image"""
+    """Loads the captions of an image"""

     def __init__(
         self,
@@ -23,6 +23,11 @@ class ImageCaptionLoader(BaseLoader):
     ):
         """
         Initialize with a list of image paths
+
+        Args:
+            path_images: A list of image paths.
+            blip_processor: The name of the pre-trained BLIP processor.
+            blip_model: The name of the pre-trained BLIP model.
         """
         if isinstance(path_images, str):
             self.image_paths = [path_images]
@@ -1,4 +1,4 @@
-"""Loader that loads IMSDb."""
+"""Loads IMSDb."""
 from typing import List

 from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader


 class IMSDbLoader(WebBaseLoader):
-    """Loader that loads IMSDb webpages."""
+    """Loads IMSDb webpages."""

     def load(self) -> List[Document]:
         """Load webpage."""
@@ -20,6 +20,12 @@ class IuguLoader(BaseLoader):
     """Loader that fetches data from IUGU."""

     def __init__(self, resource: str, api_token: Optional[str] = None) -> None:
+        """Initialize the IUGU resource.
+
+        Args:
+            resource: The name of the resource to fetch.
+            api_token: The IUGU API token to use.
+        """
         self.resource = resource
         api_token = api_token or get_from_env("api_token", "IUGU_API_TOKEN")
         self.headers = {"Authorization": f"Bearer {api_token}"}
@@ -30,6 +30,14 @@ class JoplinLoader(BaseLoader):
         port: int = 41184,
         host: str = "localhost",
     ) -> None:
+        """
+
+        Args:
+            access_token: The access token to use.
+            port: The port where the Web Clipper service is running. Default is 41184.
+            host: The host where the Web Clipper service is running.
+                Default is localhost.
+        """
         access_token = access_token or get_from_env(
             "access_token", "JOPLIN_ACCESS_TOKEN"
         )
@@ -1,4 +1,4 @@
-"""Loader that loads data from JSON."""
+"""Loads data from JSON."""
 import json
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
@@ -8,8 +8,7 @@ from langchain.document_loaders.base import BaseLoader


 class JSONLoader(BaseLoader):
-    """Loads a JSON file and references a jq schema provided to load the text into
-    documents.
+    """Loads a JSON file using a jq schema.

     Example:
         [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
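A sketch matching the docstring's jq example above; the file path is hypothetical:

from langchain.document_loaders import JSONLoader

# For a file shaped like [{"text": ...}, {"text": ...}], the schema
# ".[].text" selects each "text" field as a Document's page_content.
loader = JSONLoader(file_path="data/example.json", jq_schema=".[].text")
docs = loader.load()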
@@ -101,7 +100,7 @@ class JSONLoader(BaseLoader):
         return str(content) if content is not None else ""

     def _validate_content_key(self, data: Any) -> None:
-        """Check if content key is valid"""
+        """Check if a content key is valid"""
         sample = data.first()
         if not isinstance(sample, dict):
             raise ValueError(
@@ -1,4 +1,4 @@
-"""Loader that loads LarkSuite (FeiShu) document json dump."""
+"""Loads LarkSuite (FeiShu) document json dump."""
 import json
 import urllib.request
 from typing import Any, Iterator, List
@@ -8,10 +8,16 @@ from langchain.document_loaders.base import BaseLoader


 class LarkSuiteDocLoader(BaseLoader):
-    """Loader that loads LarkSuite (FeiShu) document."""
+    """Loads LarkSuite (FeiShu) document."""

     def __init__(self, domain: str, access_token: str, document_id: str):
-        """Initialize with domain, access_token (tenant / user), and document_id."""
+        """Initialize with domain, access_token (tenant / user), and document_id.
+
+        Args:
+            domain: The domain to load the LarkSuite.
+            access_token: The access_token to use.
+            document_id: The document_id to load.
+        """
         self.domain = domain
         self.access_token = access_token
         self.document_id = document_id
@@ -1,4 +1,4 @@
-"""Loader that loads Markdown files."""
+"""Loads Markdown files."""
 from typing import List

 from langchain.document_loaders.unstructured import UnstructuredFileLoader
@@ -15,7 +15,7 @@ def _dependable_mastodon_import() -> mastodon:
     try:
         import mastodon
     except ImportError:
-        raise ValueError(
+        raise ImportError(
            "Mastodon.py package not found, "
            "please install it with `pip install Mastodon.py`"
        )
@@ -37,11 +37,13 @@ class MastodonTootsLoader(BaseLoader):

         Args:
             mastodon_accounts: The list of Mastodon accounts to query.
-            number_toots: How many toots to pull for each account.
+            number_toots: How many toots to pull for each account. Default is 100.
             exclude_replies: Whether to exclude reply toots from the load.
+                Default is False.
             access_token: An access token if toots are loaded as a Mastodon app. Can
                 also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
             api_base_url: A Mastodon API base URL to talk to, if not using the default.
+                Default is "https://mastodon.social".
         """
         mastodon = _dependable_mastodon_import()
         access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")
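A sketch of MastodonTootsLoader with the defaults documented above; the account handle is an example:

from langchain.document_loaders import MastodonTootsLoader

loader = MastodonTootsLoader(
    mastodon_accounts=["@Gargron@mastodon.social"],  # example account
    number_toots=50,          # documented default is 100
    exclude_replies=True,
    api_base_url="https://mastodon.social",
)
docs = loader.load()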
@@ -32,12 +32,17 @@ class MWDumpLoader(BaseLoader):
     """

     def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
-        """Initialize with file path."""
+        """Initialize with a file path.
+
+        Args:
+            file_path: XML local file path
+            encoding: Charset encoding, defaults to "utf8"
+        """
         self.file_path = file_path
         self.encoding = encoding

     def load(self) -> List[Document]:
-        """Load from file path."""
+        """Load from a file path."""
         import mwparserfromhell
         import mwxml

@@ -1,4 +1,4 @@
-"""Loader to load MHTML files, enriching metadata with page title."""
+"""Load MHTML files, enriching metadata with page title."""

 import email
 import logging
@@ -21,11 +21,18 @@ class MHTMLLoader(BaseLoader):
         get_text_separator: str = "",
     ) -> None:
         """Initialise with path, and optionally, file encoding to use, and any kwargs
-        to pass to the BeautifulSoup object."""
+        to pass to the BeautifulSoup object.
+
+        Args:
+            file_path: The path to the file to load.
+            open_encoding: The encoding to use when opening the file.
+            bs_kwargs: soup kwargs to pass to the BeautifulSoup object.
+            get_text_separator: The separator to use when getting text from the soup.
+        """
         try:
             import bs4  # noqa:F401
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "beautifulsoup4 package not found, please install it with "
                 "`pip install beautifulsoup4`"
             )