docstrings document_loaders 2 (#6890)

Updated docstrings for the `document_loaders` modules.

Maintainer responsibilities:
  - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
Leonid Ganeline 2023-07-02 12:14:22 -07:00 committed by GitHub
parent 77ae8084a0
commit 1feac83323
38 changed files with 322 additions and 74 deletions


@@ -9,7 +9,7 @@ class CollegeConfidentialLoader(WebBaseLoader):
    """Loader that loads College Confidential webpages."""

    def load(self) -> List[Document]:
-        """Load webpage."""
+        """Load webpages as Documents."""
        soup = self.scrape()
        text = soup.select_one("main[class='skin-handler']").text
        metadata = {"source": self.web_path}


@@ -33,8 +33,9 @@ class ContentFormat(str, Enum):
class ConfluenceLoader(BaseLoader):
-    """
-    Load Confluence pages. Port of https://llamahub.ai/l/confluence
+    """Load Confluence pages.
+
+    Port of https://llamahub.ai/l/confluence

    This currently supports username/api_key, Oauth2 login or personal access token
    authentication.

@@ -175,7 +176,7 @@ class ConfluenceLoader(BaseLoader):
                "key_cert",
            ]:
                errors.append(
-                    "You have either ommited require keys or added extra "
+                    "You have either omitted require keys or added extra "
                    "keys to the oauth2 dictionary. key values should be "
                    "`['access_token', 'access_token_secret', 'consumer_key', 'key_cert']`"
                )

@@ -343,7 +344,7 @@ class ConfluenceLoader(BaseLoader):
        doesn't match the limit value. If `limit` is >100 confluence
        seems to cap the response to 100. Also, due to the Atlassian Python
        package, we don't get the "next" values from the "_links" key because
-        they only return the value from the results key. So here, the pagination
+        they only return the value from the result key. So here, the pagination
        starts from 0 and goes until the max_pages, getting the `limit` number
        of pages with each request. We have to manually check if there
        are more docs based on the length of the returned list of pages, rather than
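
For orientation, a minimal usage sketch of the ConfluenceLoader documented above; the site URL, credentials, and space key are placeholders, not part of the commit:

    from langchain.document_loaders import ConfluenceLoader

    # Placeholder site and credentials; username/api_key is one of the
    # supported auth modes (OAuth2 and personal access tokens also work).
    loader = ConfluenceLoader(
        url="https://yoursite.atlassian.com/wiki",
        username="me@example.com",
        api_key="<api-key>",
    )
    # Pages are fetched in batches of `limit`, per the pagination note above.
    documents = loader.load(space_key="SPACE", limit=50)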


@@ -10,11 +10,11 @@ class CoNLLULoader(BaseLoader):
    """Load CoNLL-U files."""

    def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
        self.file_path = file_path

    def load(self) -> List[Document]:
-        """Load from file path."""
+        """Load from a file path."""
        with open(self.file_path, encoding="utf8") as f:
            tsv = list(csv.reader(f, delimiter="\t"))


@@ -37,6 +37,16 @@ class CSVLoader(BaseLoader):
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
    ):
+        """
+        Args:
+            file_path: The path to the CSV file.
+            source_column: The name of the column in the CSV file to use as the source.
+               Optional. Defaults to None.
+            csv_args: A dictionary of arguments to pass to the csv.DictReader.
+               Optional. Defaults to None.
+            encoding: The encoding of the CSV file. Optional. Defaults to None.
+        """
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding

@@ -73,6 +83,14 @@ class UnstructuredCSVLoader(UnstructuredFileLoader):
    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
+        """
+        Args:
+            file_path: The path to the CSV file.
+            mode: The mode to use when loading the CSV file.
+                Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
        validate_unstructured_version(min_unstructured_version="0.6.8")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
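
A minimal usage sketch for the CSVLoader arguments documented above; the file name and column names are hypothetical:

    from langchain.document_loaders import CSVLoader

    # Each CSV row becomes one Document; csv_args is forwarded to csv.DictReader.
    loader = CSVLoader(
        file_path="data.csv",
        source_column="title",
        csv_args={"delimiter": ";"},
        encoding="utf-8",
    )
    docs = loader.load()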


@@ -1,4 +1,4 @@
-"""Load from Dataframe object"""
+"""Load from a Dataframe object"""
from typing import Any, Iterator, List

from langchain.docstore.document import Document

@@ -6,10 +6,16 @@ from langchain.document_loaders.base import BaseLoader
class DataFrameLoader(BaseLoader):
-    """Load Pandas DataFrames."""
+    """Load Pandas DataFrame."""

    def __init__(self, data_frame: Any, page_content_column: str = "text"):
-        """Initialize with dataframe object."""
+        """Initialize with dataframe object.
+
+        Args:
+            data_frame: Pandas DataFrame object.
+            page_content_column: Name of the column containing the page content.
+              Defaults to "text".
+        """
        import pandas as pd

        if not isinstance(data_frame, pd.DataFrame):
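
A short sketch of DataFrameLoader under the documented defaults; the frame below is made up for illustration:

    import pandas as pd
    from langchain.document_loaders import DataFrameLoader

    df = pd.DataFrame({"text": ["hello", "world"], "author": ["a", "b"]})
    # page_content comes from the named column; remaining columns go to metadata.
    loader = DataFrameLoader(df, page_content_column="text")
    docs = loader.load()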


@@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)
class DiffbotLoader(BaseLoader):
-    """Loader that loads Diffbot file json."""
+    """Loads Diffbot file json."""

    def __init__(
        self, api_token: str, urls: List[str], continue_on_failure: bool = True
    ):
-        """Initialize with API token, ids, and key."""
+        """Initialize with API token, ids, and key.
+
+        Args:
+            api_token: Diffbot API token.
+            urls: List of URLs to load.
+            continue_on_failure: Whether to continue loading other URLs if one fails.
+                Defaults to True.
+        """
        self.api_token = api_token
        self.urls = urls
        self.continue_on_failure = continue_on_failure

@@ -38,7 +45,7 @@ class DiffbotLoader(BaseLoader):
        return response.json() if response.ok else {}

    def load(self) -> List[Document]:
-        """Extract text from Diffbot on all the URLs and return Document instances"""
+        """Extract text from Diffbot on all the URLs and return Documents"""
        docs: List[Document] = list()

        for url in self.urls:


@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from a directory."""
+"""Load documents from a directory."""
import concurrent
import logging
from pathlib import Path

@@ -25,7 +25,7 @@ def _is_visible(p: Path) -> bool:
class DirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from a directory."""
+    """Load documents from a directory."""

    def __init__(
        self,

@@ -40,7 +40,22 @@ class DirectoryLoader(BaseLoader):
        use_multithreading: bool = False,
        max_concurrency: int = 4,
    ):
-        """Initialize with path to directory and how to glob over it."""
+        """Initialize with a path to directory and how to glob over it.
+
+        Args:
+            path: Path to directory.
+            glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
+               (all files except hidden).
+            silent_errors: Whether to silently ignore errors. Defaults to False.
+            load_hidden: Whether to load hidden files. Defaults to False.
+            loader_cls: Loader class to use for loading files.
+              Defaults to UnstructuredFileLoader.
+            loader_kwargs: Keyword arguments to pass to loader_cls. Defaults to None.
+            recursive: Whether to recursively search for files. Defaults to False.
+            show_progress: Whether to show a progress bar. Defaults to False.
+            use_multithreading: Whether to use multithreading. Defaults to False.
+            max_concurrency: The maximum number of threads to use. Defaults to 4.
+        """
        if loader_kwargs is None:
            loader_kwargs = {}
        self.path = path

@@ -57,6 +72,14 @@ class DirectoryLoader(BaseLoader):
    def load_file(
        self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any]
    ) -> None:
+        """Load a file.
+
+        Args:
+            item: File path.
+            path: Directory path.
+            docs: List of documents to append to.
+            pbar: Progress bar. Defaults to None.
+        """
        if item.is_file():
            if _is_visible(item.relative_to(path)) or self.load_hidden:
                try:
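
A usage sketch for DirectoryLoader with the parameters documented above; the directory and glob are illustrative, and show_progress assumes tqdm is installed:

    from langchain.document_loaders import DirectoryLoader, TextLoader

    loader = DirectoryLoader(
        "docs/",                  # path to the directory
        glob="**/*.md",           # only Markdown files
        loader_cls=TextLoader,    # instead of the default UnstructuredFileLoader
        show_progress=True,       # requires tqdm
        use_multithreading=True,  # up to max_concurrency threads
    )
    docs = loader.load()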


@@ -14,7 +14,12 @@ class DiscordChatLoader(BaseLoader):
    """Load Discord chat logs."""

    def __init__(self, chat_log: pd.DataFrame, user_id_col: str = "ID"):
-        """Initialize with a Pandas DataFrame containing chat logs."""
+        """Initialize with a Pandas DataFrame containing chat logs.
+
+        Args:
+            chat_log: Pandas DataFrame containing chat logs.
+            user_id_col: Name of the column containing the user ID. Defaults to "ID".
+        """
        if not isinstance(chat_log, pd.DataFrame):
            raise ValueError(
                f"Expected chat_log to be a pd.DataFrame, got {type(chat_log)}"


@@ -1,4 +1,4 @@
-"""Loader that loads processed documents from Docugami."""
+"""Loads processed documents from Docugami."""
import io
import logging

@@ -29,22 +29,35 @@ logger = logging.getLogger(__name__)
class DocugamiLoader(BaseLoader, BaseModel):
-    """Loader that loads processed docs from Docugami.
+    """Loads processed docs from Docugami.

    To use, you should have the ``lxml`` python package installed.
    """

    api: str = DEFAULT_API_ENDPOINT
+    """The Docugami API endpoint to use."""
    access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY")
+    """The Docugami API access token to use."""
    docset_id: Optional[str]
+    """The Docugami API docset ID to use."""
    document_ids: Optional[Sequence[str]]
+    """The Docugami API document IDs to use."""
    file_paths: Optional[Sequence[Union[Path, str]]]
+    """The local file paths to use."""
    min_chunk_size: int = 32  # appended to the next chunk to avoid over-chunking
+    """The minimum chunk size to use when parsing DGML. Defaults to 32."""

    @root_validator
    def validate_local_or_remote(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        """Validate that either local file paths are given, or remote API docset ID."""
+        """Validate that either local file paths are given, or remote API docset ID.
+
+        Args:
+            values: The values to validate.
+
+        Returns:
+            The validated values.
+        """
        if values.get("file_paths") and values.get("docset_id"):
            raise ValueError("Cannot specify both file_paths and remote API docset_id")


@@ -22,6 +22,20 @@ class DuckDBLoader(BaseLoader):
        page_content_columns: Optional[List[str]] = None,
        metadata_columns: Optional[List[str]] = None,
    ):
+        """
+        Args:
+            query: The query to execute.
+            database: The database to connect to. Defaults to ":memory:".
+            read_only: Whether to open the database in read-only mode.
+              Defaults to False.
+            config: A dictionary of configuration options to pass to the database.
+              Optional.
+            page_content_columns: The columns to write into the `page_content`
+              of the document. Optional.
+            metadata_columns: The columns to write into the `metadata` of the document.
+              Optional.
+        """
        self.query = query
        self.database = database
        self.read_only = read_only
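
A sketch of DuckDBLoader with the arguments documented above; the query, file, and column names are invented for illustration:

    from langchain.document_loaders import DuckDBLoader

    loader = DuckDBLoader(
        "SELECT title, body FROM read_csv_auto('posts.csv')",
        page_content_columns=["body"],  # becomes Document.page_content
        metadata_columns=["title"],     # becomes Document.metadata
    )
    docs = loader.load()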


@@ -1,4 +1,4 @@
-"""Loader that loads email files."""
+"""Loads email files."""
import os
from typing import Any, List

@@ -72,12 +72,17 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
class OutlookMessageLoader(BaseLoader):
    """
-    Loader that loads Outlook Message files using extract_msg.
+    Loads Outlook Message files using extract_msg.

    https://github.com/TeamMsgExtractor/msg-extractor
    """

    def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path.
+
+        Args:
+            file_path: The path to the Outlook Message file.
+        """
        self.file_path = file_path


@@ -52,7 +52,10 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
class BaseEmbaasLoader(BaseModel):
+    """Base class for embedding a model into an Embaas document extraction API."""

    embaas_api_key: Optional[str] = None
+    """The API key for the embaas document extraction API."""
    api_url: str = EMBAAS_DOC_API_URL
    """The URL of the embaas document extraction API."""
    params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()

@@ -69,7 +72,7 @@ class BaseEmbaasLoader(BaseModel):
class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
-    """Wrapper around embaas's document byte loader service.
+    """Embaas's document byte loader.

    To use, you should have the
    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass

@@ -99,6 +102,11 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Parses the blob lazily.
+
+        Args:
+            blob: The blob to parse.
+        """
        yield from self._get_documents(blob=blob)

    @staticmethod

@@ -170,7 +178,7 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
-    """Wrapper around embaas's document loader service.
+    """Embaas's document loader.

    To use, you should have the
    environment variable ``EMBAAS_API_KEY`` set with your API key, or pass


@@ -14,6 +14,7 @@ from langchain.document_loaders.base import BaseLoader
class EverNoteLoader(BaseLoader):
    """EverNote Loader.

    Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
    Instructions on producing this file can be found at
    https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML


@@ -13,6 +13,14 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
    def __init__(
        self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
    ):
+        """
+        Args:
+            file_path: The path to the Microsoft Excel file.
+            mode: The mode to use when partitioning the file. See unstructured docs
+              for more info. Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
        validate_unstructured_version(min_unstructured_version="0.6.7")
        super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)


@@ -9,7 +9,11 @@ from langchain.document_loaders.base import BaseLoader
def concatenate_rows(row: dict) -> str:
-    """Combine message information in a readable format ready to be used."""
+    """Combine message information in a readable format ready to be used.
+
+    Args:
+        row: dictionary containing message information.
+    """
    sender = row["sender_name"]
    text = row["content"]
    date = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000).strftime(

@@ -19,10 +23,10 @@ def concatenate_rows(row: dict) -> str:
class FacebookChatLoader(BaseLoader):
-    """Loader that loads Facebook messages json directory dump."""
+    """Loads Facebook messages json directory dump."""

    def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a path."""
        self.file_path = path

    def load(self) -> List[Document]:


@@ -9,10 +9,16 @@ from langchain.utils import stringify_dict
class FigmaFileLoader(BaseLoader):
-    """Loader that loads Figma file json."""
+    """Loads Figma file json."""

    def __init__(self, access_token: str, ids: str, key: str):
-        """Initialize with access token, ids, and key."""
+        """Initialize with access token, ids, and key.
+
+        Args:
+            access_token: The access token for the Figma REST API.
+            ids: The ids of the Figma file.
+            key: The key for the Figma file
+        """
        self.access_token = access_token
        self.ids = ids
        self.key = key


@@ -7,10 +7,16 @@ from langchain.document_loaders.gcs_file import GCSFileLoader
class GCSDirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from GCS."""
+    """Loads Documents from GCS."""

    def __init__(self, project_name: str, bucket: str, prefix: str = ""):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            project_name: The name of the project for the GCS bucket.
+            bucket: The name of the GCS bucket.
+            prefix: The prefix of the GCS bucket.
+        """
        self.project_name = project_name
        self.bucket = bucket
        self.prefix = prefix

@@ -20,7 +26,7 @@ class GCSDirectoryLoader(BaseLoader):
        try:
            from google.cloud import storage
        except ImportError:
-            raise ValueError(
+            raise ImportError(
                "Could not import google-cloud-storage python package. "
                "Please install it with `pip install google-cloud-storage`."
            )


@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from a GCS file."""
+"""Load documents from a GCS file."""
import os
import tempfile
from typing import List

@@ -9,10 +9,16 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class GCSFileLoader(BaseLoader):
-    """Loading logic for loading documents from GCS."""
+    """Load Documents from a GCS file."""

    def __init__(self, project_name: str, bucket: str, blob: str):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            project_name: The name of the project to load
+            bucket: The name of the GCS bucket.
+            blob: The name of the GCS blob to load.
+        """
        self.bucket = bucket
        self.blob = blob
        self.project_name = project_name

@@ -22,7 +28,7 @@ class GCSFileLoader(BaseLoader):
        try:
            from google.cloud import storage
        except ImportError:
-            raise ValueError(
+            raise ImportError(
                "Could not import google-cloud-storage python package. "
                "Please install it with `pip install google-cloud-storage`."
            )


@@ -7,9 +7,9 @@ from langchain.document_loaders.base import BaseLoader
class GitLoader(BaseLoader):
    """Loads files from a Git repository into a list of documents.

-    Repository can be local on disk available at `repo_path`,
+    The Repository can be local on disk available at `repo_path`,
    or remote at `clone_url` that will be cloned to `repo_path`.
-    Currently supports only text files.
+    Currently, supports only text files.

    Each document represents one file in the repository. The `path` points to
    the local Git repository, and the `branch` specifies the branch to load

@@ -23,6 +23,15 @@ class GitLoader(BaseLoader):
        branch: Optional[str] = "main",
        file_filter: Optional[Callable[[str], bool]] = None,
    ):
+        """
+        Args:
+            repo_path: The path to the Git repository.
+            clone_url: Optional. The URL to clone the repository from.
+            branch: Optional. The branch to load files from. Defaults to `main`.
+            file_filter: Optional. A function that takes a file path and returns
+              a boolean indicating whether to load the file. Defaults to None.
+        """
        self.repo_path = repo_path
        self.clone_url = clone_url
        self.branch = branch
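
A usage sketch of GitLoader; the clone URL, branch, and filter below are examples, not part of the commit:

    from langchain.document_loaders import GitLoader

    loader = GitLoader(
        repo_path="./example_repo",
        clone_url="https://github.com/hwchase17/langchain",
        branch="master",
        file_filter=lambda file_path: file_path.endswith(".py"),
    )
    docs = loader.load()  # one Document per matching file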


@@ -28,7 +28,9 @@ class GitbookLoader(WebBaseLoader):
            load_all_paths: If set to True, all relative paths in the navbar
                are loaded instead of only `web_page`.
            base_url: If `load_all_paths` is True, the relative paths are
-                appended to this base url. Defaults to `web_page` if not set.
+                appended to this base url. Defaults to `web_page`.
+            content_selector: The CSS selector for the content to load.
+                Defaults to "main".
        """
        self.base_url = base_url or web_page
        if self.base_url.endswith("/"):


@@ -35,6 +35,8 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
class GitHubIssuesLoader(BaseGitHubLoader):
+    """Load issues of a GitHub repository."""

    include_prs: bool = True
    """If True include Pull Requests in results, otherwise ignore them."""
    milestone: Union[int, Literal["*", "none"], None] = None

@@ -159,6 +161,7 @@ class GitHubIssuesLoader(BaseGitHubLoader):
    @property
    def query_params(self) -> str:
+        """Create query parameters for GitHub API."""
        labels = ",".join(self.labels) if self.labels else self.labels
        query_params_dict = {
            "milestone": self.milestone,

@@ -179,4 +182,5 @@ class GitHubIssuesLoader(BaseGitHubLoader):
    @property
    def url(self) -> str:
+        """Create URL for GitHub API."""
        return f"https://api.github.com/repos/{self.repo}/issues?{self.query_params}"


@@ -22,21 +22,32 @@ SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
class GoogleDriveLoader(BaseLoader, BaseModel):
-    """Loader that loads Google Docs from Google Drive."""
+    """Loads Google Docs from Google Drive."""

    service_account_key: Path = Path.home() / ".credentials" / "keys.json"
+    """Path to the service account key file."""
    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
+    """Path to the credentials file."""
    token_path: Path = Path.home() / ".credentials" / "token.json"
+    """Path to the token file."""
    folder_id: Optional[str] = None
+    """The folder id to load from."""
    document_ids: Optional[List[str]] = None
+    """The document ids to load from."""
    file_ids: Optional[List[str]] = None
+    """The file ids to load from."""
    recursive: bool = False
+    """Whether to load recursively. Only applies when folder_id is given."""
    file_types: Optional[Sequence[str]] = None
+    """The file types to load. Only applies when folder_id is given."""
    load_trashed_files: bool = False
+    """Whether to load trashed files. Only applies when folder_id is given."""
    # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
    # results in pydantic validation errors
    file_loader_cls: Any = None
+    """The file loader class to use."""
    file_loader_kwargs: Dict["str", Any] = {}
+    """The file loader kwargs to use."""

    @root_validator
    def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:


@@ -1,4 +1,4 @@
-"""Loader that loads .txt web files."""
+"""Loads .txt web files."""
from typing import List

from langchain.docstore.document import Document

@@ -9,7 +9,7 @@ class GutenbergLoader(BaseLoader):
    """Loader that uses urllib to load .txt web files."""

    def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
        if not file_path.startswith("https://www.gutenberg.org"):
            raise ValueError("file path must start with 'https://www.gutenberg.org'")


@@ -5,9 +5,14 @@ from typing import List, NamedTuple, Optional, cast
class FileEncoding(NamedTuple):
+    """A file encoding as the NamedTuple."""

    encoding: Optional[str]
+    """The encoding of the file."""
    confidence: float
+    """The confidence of the encoding."""
    language: Optional[str]
+    """The language of the file."""


def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:

@@ -15,6 +20,10 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
    Returns a list of `FileEncoding` tuples with the detected encodings ordered
    by confidence.

+    Args:
+        file_path: The path to the file to detect the encoding for.
+        timeout: The timeout in seconds for the encoding detection.
    """
    import chardet
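
A sketch of calling detect_file_encodings as documented above; "legacy.txt" is a placeholder for a file with an unknown charset:

    from langchain.document_loaders.helpers import detect_file_encodings

    encodings = detect_file_encodings("legacy.txt", timeout=5)
    best = encodings[0]  # results are ordered by confidence
    print(best.encoding, best.confidence, best.language)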


@@ -1,4 +1,4 @@
-"""Loader that loads HN."""
+"""Loader that loads Hacker News."""
from typing import Any, List

from langchain.docstore.document import Document

@@ -11,7 +11,7 @@ class HNLoader(WebBaseLoader):
    def load(self) -> List[Document]:
        """Get important HN webpage information.

-        Components are:
+        HN webpage components are:
            - title
            - content
            - source url,


@@ -20,11 +20,18 @@ class BSHTMLLoader(BaseLoader):
        get_text_separator: str = "",
    ) -> None:
        """Initialise with path, and optionally, file encoding to use, and any kwargs
-        to pass to the BeautifulSoup object."""
+        to pass to the BeautifulSoup object.
+
+        Args:
+            file_path: The path to the file to load.
+            open_encoding: The encoding to use when opening the file.
+            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
+            get_text_separator: The separator to use when calling get_text on the soup.
+        """
        try:
            import bs4  # noqa:F401
        except ImportError:
-            raise ValueError(
+            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            )

@@ -37,9 +44,9 @@ class BSHTMLLoader(BaseLoader):
        self.get_text_separator = get_text_separator

    def load(self) -> List[Document]:
+        """Load HTML document into document objects."""
        from bs4 import BeautifulSoup
-        """Load HTML document into document objects."""

        with open(self.file_path, "r", encoding=self.open_encoding) as f:
            soup = BeautifulSoup(f, **self.bs_kwargs)
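
A minimal sketch of BSHTMLLoader; the file name is a placeholder, and beautifulsoup4 must be installed:

    from langchain.document_loaders import BSHTMLLoader

    loader = BSHTMLLoader("page.html", get_text_separator="\n")
    docs = loader.load()  # metadata carries the source path and page title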


@@ -1,4 +1,4 @@
-"""Loader that loads HuggingFace datasets."""
+"""Loads HuggingFace datasets."""
from typing import Iterator, List, Mapping, Optional, Sequence, Union

from langchain.docstore.document import Document

@@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
class HuggingFaceDatasetLoader(BaseLoader):
-    """Loading logic for loading documents from the Hugging Face Hub."""
+    """Load Documents from the Hugging Face Hub."""

    def __init__(
        self,

@@ -27,14 +27,15 @@ class HuggingFaceDatasetLoader(BaseLoader):
        Args:
            path: Path or name of the dataset.
-            page_content_column: Page content column name.
+            page_content_column: Page content column name. Default is "text".
            name: Name of the dataset configuration.
            data_dir: Data directory of the dataset configuration.
            data_files: Path(s) to source data file(s).
            cache_dir: Directory to read/write data.
            keep_in_memory: Whether to copy the dataset in-memory.
            save_infos: Save the dataset information (checksums/size/splits/...).
+              Default is False.
-            use_auth_token: Bearer token for remote files on the Datasets Hub.
+            use_auth_token: Bearer token for remote files on the Dataset Hub.
            num_proc: Number of processes.
        """


@@ -22,7 +22,7 @@ class IFixitLoader(BaseLoader):
    """

    def __init__(self, web_path: str):
-        """Initialize with web path."""
+        """Initialize with a web path."""
        if not web_path.startswith("https://www.ifixit.com"):
            raise ValueError("web path must start with 'https://www.ifixit.com'")

@@ -60,6 +60,16 @@ class IFixitLoader(BaseLoader):
    @staticmethod
    def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
+        """Load suggestions.
+
+        Args:
+            query: A query string
+            doc_type: The type of document to search for. Can be one of "all",
+              "device", "guide", "teardown", "answer", "wiki".
+
+        Returns:
+
+        """
        res = requests.get(
            IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
        )

@@ -89,6 +99,14 @@ class IFixitLoader(BaseLoader):
    def load_questions_and_answers(
        self, url_override: Optional[str] = None
    ) -> List[Document]:
+        """Load a list of questions and answers.
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+        """
        loader = WebBaseLoader(self.web_path if url_override is None else url_override)
        soup = loader.scrape()

@@ -125,6 +143,16 @@ class IFixitLoader(BaseLoader):
    def load_device(
        self, url_override: Optional[str] = None, include_guides: bool = True
    ) -> List[Document]:
+        """Loads a device
+
+        Args:
+            url_override: A URL to override the default URL.
+            include_guides: Whether to include guides linked to from the device.
+              Defaults to True.
+
+        Returns:
+
+        """
        documents = []
        if url_override is None:
            url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id

@@ -153,6 +181,14 @@ class IFixitLoader(BaseLoader):
        return documents

    def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
+        """Load a guide
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+        """
        if url_override is None:
            url = IFIXIT_BASE_URL + "/guides/" + self.id
        else:


@@ -1,5 +1,5 @@
-"""
-Loader that loads image captions
+"""Loads image captions.

By default, the loader utilizes the pre-trained BLIP image captioning model.
https://huggingface.co/Salesforce/blip-image-captioning-base

@@ -13,7 +13,7 @@ from langchain.document_loaders.base import BaseLoader
class ImageCaptionLoader(BaseLoader):
-    """Loader that loads the captions of an image"""
+    """Loads the captions of an image"""

    def __init__(
        self,

@@ -23,6 +23,11 @@ class ImageCaptionLoader(BaseLoader):
    ):
        """
        Initialize with a list of image paths
+
+        Args:
+            path_images: A list of image paths.
+            blip_processor: The name of the pre-trained BLIP processor.
+            blip_model: The name of the pre-trained BLIP model.
        """
        if isinstance(path_images, str):
            self.image_paths = [path_images]


@@ -1,4 +1,4 @@
-"""Loader that loads IMSDb."""
+"""Loads IMSDb."""
from typing import List

from langchain.docstore.document import Document

@@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
class IMSDbLoader(WebBaseLoader):
-    """Loader that loads IMSDb webpages."""
+    """Loads IMSDb webpages."""

    def load(self) -> List[Document]:
        """Load webpage."""


@@ -20,6 +20,12 @@ class IuguLoader(BaseLoader):
    """Loader that fetches data from IUGU."""

    def __init__(self, resource: str, api_token: Optional[str] = None) -> None:
+        """Initialize the IUGU resource.
+
+        Args:
+            resource: The name of the resource to fetch.
+            api_token: The IUGU API token to use.
+        """
        self.resource = resource
        api_token = api_token or get_from_env("api_token", "IUGU_API_TOKEN")
        self.headers = {"Authorization": f"Bearer {api_token}"}


@@ -30,6 +30,14 @@ class JoplinLoader(BaseLoader):
        port: int = 41184,
        host: str = "localhost",
    ) -> None:
+        """
+        Args:
+            access_token: The access token to use.
+            port: The port where the Web Clipper service is running. Default is 41184.
+            host: The host where the Web Clipper service is running.
+              Default is localhost.
+        """
        access_token = access_token or get_from_env(
            "access_token", "JOPLIN_ACCESS_TOKEN"
        )


@@ -1,4 +1,4 @@
-"""Loader that loads data from JSON."""
+"""Loads data from JSON."""
import json
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union

@@ -8,8 +8,7 @@ from langchain.document_loaders.base import BaseLoader
class JSONLoader(BaseLoader):
-    """Loads a JSON file and references a jq schema provided to load the text into
-    documents.
+    """Loads a JSON file using a jq schema.

    Example:
        [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text

@@ -101,7 +100,7 @@ class JSONLoader(BaseLoader):
        return str(content) if content is not None else ""

    def _validate_content_key(self, data: Any) -> None:
-        """Check if content key is valid"""
+        """Check if a content key is valid"""
        sample = data.first()
        if not isinstance(sample, dict):
            raise ValueError(
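
A sketch of JSONLoader with a jq schema; the file and jq path are illustrative, and the jq package must be installed:

    from langchain.document_loaders import JSONLoader

    # Extracts one Document per message content string.
    loader = JSONLoader(
        file_path="chat.json",
        jq_schema=".messages[].content",
    )
    docs = loader.load()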


@@ -1,4 +1,4 @@
-"""Loader that loads LarkSuite (FeiShu) document json dump."""
+"""Loads LarkSuite (FeiShu) document json dump."""
import json
import urllib.request
from typing import Any, Iterator, List

@@ -8,10 +8,16 @@ from langchain.document_loaders.base import BaseLoader
class LarkSuiteDocLoader(BaseLoader):
-    """Loader that loads LarkSuite (FeiShu) document."""
+    """Loads LarkSuite (FeiShu) document."""

    def __init__(self, domain: str, access_token: str, document_id: str):
-        """Initialize with domain, access_token (tenant / user), and document_id."""
+        """Initialize with domain, access_token (tenant / user), and document_id.
+
+        Args:
+            domain: The domain to load the LarkSuite.
+            access_token: The access_token to use.
+            document_id: The document_id to load.
+        """
        self.domain = domain
        self.access_token = access_token
        self.document_id = document_id


@@ -1,4 +1,4 @@
-"""Loader that loads Markdown files."""
+"""Loads Markdown files."""
from typing import List

from langchain.document_loaders.unstructured import UnstructuredFileLoader


@@ -15,7 +15,7 @@ def _dependable_mastodon_import() -> mastodon:
    try:
        import mastodon
    except ImportError:
-        raise ValueError(
+        raise ImportError(
            "Mastodon.py package not found, "
            "please install it with `pip install Mastodon.py`"
        )

@@ -37,11 +37,13 @@ class MastodonTootsLoader(BaseLoader):
        Args:
            mastodon_accounts: The list of Mastodon accounts to query.
-            number_toots: How many toots to pull for each account.
+            number_toots: How many toots to pull for each account. Default is 100.
            exclude_replies: Whether to exclude reply toots from the load.
+                Default is False.
            access_token: An access token if toots are loaded as a Mastodon app. Can
                also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
            api_base_url: A Mastodon API base URL to talk to, if not using the default.
+                Default is "https://mastodon.social".
        """
        mastodon = _dependable_mastodon_import()
        access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")


@@ -32,12 +32,17 @@ class MWDumpLoader(BaseLoader):
    """

    def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
-        """Initialize with file path."""
+        """Initialize with a file path.
+
+        Args:
+            file_path: XML local file path
+            encoding: Charset encoding, defaults to "utf8"
+        """
        self.file_path = file_path
        self.encoding = encoding

    def load(self) -> List[Document]:
-        """Load from file path."""
+        """Load from a file path."""
        import mwparserfromhell
        import mwxml


@@ -1,4 +1,4 @@
-"""Loader to load MHTML files, enriching metadata with page title."""
+"""Load MHTML files, enriching metadata with page title."""
import email
import logging

@@ -21,11 +21,18 @@ class MHTMLLoader(BaseLoader):
        get_text_separator: str = "",
    ) -> None:
        """Initialise with path, and optionally, file encoding to use, and any kwargs
-        to pass to the BeautifulSoup object."""
+        to pass to the BeautifulSoup object.
+
+        Args:
+            file_path: The path to the file to load.
+            open_encoding: The encoding to use when opening the file.
+            bs_kwargs: soup kwargs to pass to the BeautifulSoup object.
+            get_text_separator: The separator to use when getting text from the soup.
+        """
        try:
            import bs4  # noqa:F401
        except ImportError:
-            raise ValueError(
+            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install beautifulsoup4`"
            )