docstrings document_loaders 2 (#6890)

Updated docstrings for the `document_loaders` modules.

Maintainer responsibilities:
- DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
parent 77ae8084a0
commit 1feac83323
langchain/document_loaders/college_confidential.py
@@ -9,7 +9,7 @@ class CollegeConfidentialLoader(WebBaseLoader):
     """Loader that loads College Confidential webpages."""
 
     def load(self) -> List[Document]:
-        """Load webpage."""
+        """Load webpages as Documents."""
         soup = self.scrape()
         text = soup.select_one("main[class='skin-handler']").text
         metadata = {"source": self.web_path}
langchain/document_loaders/confluence.py
@@ -33,8 +33,9 @@ class ContentFormat(str, Enum):
 
 
 class ConfluenceLoader(BaseLoader):
-    """
-    Load Confluence pages. Port of https://llamahub.ai/l/confluence
+    """Load Confluence pages.
+
+    Port of https://llamahub.ai/l/confluence
     This currently supports username/api_key, Oauth2 login or personal access token
     authentication.
 
@@ -175,7 +176,7 @@ class ConfluenceLoader(BaseLoader):
                 "key_cert",
             ]:
                 errors.append(
-                    "You have either ommited require keys or added extra "
+                    "You have either omitted require keys or added extra "
                     "keys to the oauth2 dictionary. key values should be "
                     "`['access_token', 'access_token_secret', 'consumer_key', 'key_cert']`"
                 )
@@ -340,10 +341,10 @@ class ConfluenceLoader(BaseLoader):
        """Paginate the various methods to retrieve groups of pages.
 
        Unfortunately, due to page size, sometimes the Confluence API
        doesn't match the limit value. If `limit` is >100 confluence
        seems to cap the response to 100. Also, due to the Atlassian Python
        package, we don't get the "next" values from the "_links" key because
-        they only return the value from the results key. So here, the pagination
+        they only return the value from the result key. So here, the pagination
        starts from 0 and goes until the max_pages, getting the `limit` number
        of pages with each request. We have to manually check if there
        are more docs based on the length of the returned list of pages, rather than
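A minimal usage sketch of the ConfluenceLoader API documented above (not part of this commit; the site URL, credentials, and space key are placeholders, and the atlassian-python-api package is required):

    from langchain.document_loaders import ConfluenceLoader

    loader = ConfluenceLoader(
        url="https://example.atlassian.net/wiki",  # placeholder site
        username="me@example.com",                 # username/api_key authentication
        api_key="...",
    )
    # limit is the page-size hint per request (Confluence caps it at 100);
    # max_pages bounds the total number of pages retrieved.
    docs = loader.load(space_key="ENG", limit=50, max_pages=200)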
langchain/document_loaders/conllu.py
@@ -10,11 +10,11 @@ class CoNLLULoader(BaseLoader):
     """Load CoNLL-U files."""
 
     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         self.file_path = file_path
 
     def load(self) -> List[Document]:
-        """Load from file path."""
+        """Load from a file path."""
         with open(self.file_path, encoding="utf8") as f:
             tsv = list(csv.reader(f, delimiter="\t"))
 
langchain/document_loaders/csv_loader.py
@@ -37,6 +37,16 @@ class CSVLoader(BaseLoader):
         csv_args: Optional[Dict] = None,
         encoding: Optional[str] = None,
     ):
+        """
+
+        Args:
+            file_path: The path to the CSV file.
+            source_column: The name of the column in the CSV file to use as the source.
+                Optional. Defaults to None.
+            csv_args: A dictionary of arguments to pass to the csv.DictReader.
+                Optional. Defaults to None.
+            encoding: The encoding of the CSV file. Optional. Defaults to None.
+        """
         self.file_path = file_path
         self.source_column = source_column
         self.encoding = encoding
@@ -73,6 +83,14 @@ class UnstructuredCSVLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+
+        Args:
+            file_path: The path to the CSV file.
+            mode: The mode to use when loading the CSV file.
+                Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.6.8")
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
 
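A minimal usage sketch of the CSVLoader arguments documented above (not part of this commit; the file name and column names are hypothetical):

    from langchain.document_loaders import CSVLoader

    loader = CSVLoader(
        file_path="data.csv",         # hypothetical file
        source_column="url",          # hypothetical column recorded as metadata["source"]
        csv_args={"delimiter": ";"},  # forwarded to csv.DictReader
        encoding="utf-8",
    )
    docs = loader.load()              # one Document per CSV row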
langchain/document_loaders/dataframe.py
@@ -1,4 +1,4 @@
-"""Load from Dataframe object"""
+"""Load from a Dataframe object"""
 from typing import Any, Iterator, List
 
 from langchain.docstore.document import Document
@@ -6,10 +6,16 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class DataFrameLoader(BaseLoader):
-    """Load Pandas DataFrames."""
+    """Load Pandas DataFrame."""
 
     def __init__(self, data_frame: Any, page_content_column: str = "text"):
-        """Initialize with dataframe object."""
+        """Initialize with dataframe object.
+
+        Args:
+            data_frame: Pandas DataFrame object.
+            page_content_column: Name of the column containing the page content.
+                Defaults to "text".
+        """
         import pandas as pd
 
         if not isinstance(data_frame, pd.DataFrame):
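A minimal usage sketch of DataFrameLoader as documented above (not part of this commit):

    import pandas as pd
    from langchain.document_loaders import DataFrameLoader

    df = pd.DataFrame({"text": ["first doc", "second doc"], "author": ["a", "b"]})
    loader = DataFrameLoader(df, page_content_column="text")
    docs = loader.load()  # remaining columns such as "author" land in Document.metadata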
langchain/document_loaders/diffbot.py
@@ -11,12 +11,19 @@ logger = logging.getLogger(__name__)
 
 
 class DiffbotLoader(BaseLoader):
-    """Loader that loads Diffbot file json."""
+    """Loads Diffbot file json."""
 
     def __init__(
         self, api_token: str, urls: List[str], continue_on_failure: bool = True
     ):
-        """Initialize with API token, ids, and key."""
+        """Initialize with API token, ids, and key.
+
+        Args:
+            api_token: Diffbot API token.
+            urls: List of URLs to load.
+            continue_on_failure: Whether to continue loading other URLs if one fails.
+                Defaults to True.
+        """
         self.api_token = api_token
         self.urls = urls
         self.continue_on_failure = continue_on_failure
@@ -38,7 +45,7 @@ class DiffbotLoader(BaseLoader):
         return response.json() if response.ok else {}
 
     def load(self) -> List[Document]:
-        """Extract text from Diffbot on all the URLs and return Document instances"""
+        """Extract text from Diffbot on all the URLs and return Documents"""
         docs: List[Document] = list()
 
         for url in self.urls:
langchain/document_loaders/directory.py
@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from a directory."""
+"""Load documents from a directory."""
 import concurrent
 import logging
 from pathlib import Path
@@ -25,7 +25,7 @@ def _is_visible(p: Path) -> bool:
 
 
 class DirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from a directory."""
+    """Load documents from a directory."""
 
     def __init__(
         self,
@@ -40,7 +40,22 @@ class DirectoryLoader(BaseLoader):
         use_multithreading: bool = False,
         max_concurrency: int = 4,
     ):
-        """Initialize with path to directory and how to glob over it."""
+        """Initialize with a path to directory and how to glob over it.
+
+        Args:
+            path: Path to directory.
+            glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
+               (all files except hidden).
+            silent_errors: Whether to silently ignore errors. Defaults to False.
+            load_hidden: Whether to load hidden files. Defaults to False.
+            loader_cls: Loader class to use for loading files.
+              Defaults to UnstructuredFileLoader.
+            loader_kwargs: Keyword arguments to pass to loader_cls. Defaults to None.
+            recursive: Whether to recursively search for files. Defaults to False.
+            show_progress: Whether to show a progress bar. Defaults to False.
+            use_multithreading: Whether to use multithreading. Defaults to False.
+            max_concurrency: The maximum number of threads to use. Defaults to 4.
+        """
         if loader_kwargs is None:
             loader_kwargs = {}
         self.path = path
@@ -57,6 +72,14 @@ class DirectoryLoader(BaseLoader):
     def load_file(
         self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any]
     ) -> None:
+        """Load a file.
+
+        Args:
+            item: File path.
+            path: Directory path.
+            docs: List of documents to append to.
+            pbar: Progress bar. Defaults to None.
+        """
         if item.is_file():
             if _is_visible(item.relative_to(path)) or self.load_hidden:
                 try:
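A usage sketch combining the DirectoryLoader arguments documented above (not part of this commit; the directory and glob pattern are hypothetical):

    from langchain.document_loaders import DirectoryLoader, TextLoader

    loader = DirectoryLoader(
        "docs/",                # hypothetical directory
        glob="**/*.md",         # narrows the default "**/[!.]*" pattern
        loader_cls=TextLoader,  # instead of the UnstructuredFileLoader default
        recursive=True,
        show_progress=True,     # requires tqdm
        use_multithreading=True,
        max_concurrency=8,
    )
    docs = loader.load()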
langchain/document_loaders/discord.py
@@ -14,7 +14,12 @@ class DiscordChatLoader(BaseLoader):
     """Load Discord chat logs."""
 
     def __init__(self, chat_log: pd.DataFrame, user_id_col: str = "ID"):
-        """Initialize with a Pandas DataFrame containing chat logs."""
+        """Initialize with a Pandas DataFrame containing chat logs.
+
+        Args:
+            chat_log: Pandas DataFrame containing chat logs.
+            user_id_col: Name of the column containing the user ID. Defaults to "ID".
+        """
         if not isinstance(chat_log, pd.DataFrame):
             raise ValueError(
                 f"Expected chat_log to be a pd.DataFrame, got {type(chat_log)}"
langchain/document_loaders/docugami.py
@@ -1,4 +1,4 @@
-"""Loader that loads processed documents from Docugami."""
+"""Loads processed documents from Docugami."""
 
 import io
 import logging
@@ -29,22 +29,35 @@ logger = logging.getLogger(__name__)
 
 
 class DocugamiLoader(BaseLoader, BaseModel):
-    """Loader that loads processed docs from Docugami.
+    """Loads processed docs from Docugami.
 
     To use, you should have the ``lxml`` python package installed.
     """
 
     api: str = DEFAULT_API_ENDPOINT
+    """The Docugami API endpoint to use."""
 
     access_token: Optional[str] = os.environ.get("DOCUGAMI_API_KEY")
+    """The Docugami API access token to use."""
     docset_id: Optional[str]
+    """The Docugami API docset ID to use."""
     document_ids: Optional[Sequence[str]]
+    """The Docugami API document IDs to use."""
     file_paths: Optional[Sequence[Union[Path, str]]]
+    """The local file paths to use."""
     min_chunk_size: int = 32  # appended to the next chunk to avoid over-chunking
+    """The minimum chunk size to use when parsing DGML. Defaults to 32."""
 
     @root_validator
     def validate_local_or_remote(cls, values: Dict[str, Any]) -> Dict[str, Any]:
-        """Validate that either local file paths are given, or remote API docset ID."""
+        """Validate that either local file paths are given, or remote API docset ID.
+
+        Args:
+            values: The values to validate.
+
+        Returns:
+            The validated values.
+        """
         if values.get("file_paths") and values.get("docset_id"):
             raise ValueError("Cannot specify both file_paths and remote API docset_id")
 
langchain/document_loaders/duckdb_loader.py
@@ -22,6 +22,20 @@ class DuckDBLoader(BaseLoader):
         page_content_columns: Optional[List[str]] = None,
         metadata_columns: Optional[List[str]] = None,
     ):
+        """
+
+        Args:
+            query: The query to execute.
+            database: The database to connect to. Defaults to ":memory:".
+            read_only: Whether to open the database in read-only mode.
+                Defaults to False.
+            config: A dictionary of configuration options to pass to the database.
+                Optional.
+            page_content_columns: The columns to write into the `page_content`
+                of the document. Optional.
+            metadata_columns: The columns to write into the `metadata` of the document.
+                Optional.
+        """
         self.query = query
         self.database = database
         self.read_only = read_only
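A usage sketch of the DuckDBLoader arguments documented above (not part of this commit; the query and column names are hypothetical, and the duckdb package is required):

    from langchain.document_loaders import DuckDBLoader

    loader = DuckDBLoader(
        query="SELECT title, body, url FROM read_csv_auto('posts.csv')",
        database=":memory:",
        page_content_columns=["title", "body"],  # concatenated into page_content
        metadata_columns=["url"],                # copied into Document.metadata
    )
    docs = loader.load()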
langchain/document_loaders/email.py
@@ -1,4 +1,4 @@
-"""Loader that loads email files."""
+"""Loads email files."""
 import os
 from typing import Any, List
 
@@ -72,12 +72,17 @@ class UnstructuredEmailLoader(UnstructuredFileLoader):
 
 class OutlookMessageLoader(BaseLoader):
     """
-    Loader that loads Outlook Message files using extract_msg.
+    Loads Outlook Message files using extract_msg.
 
     https://github.com/TeamMsgExtractor/msg-extractor
     """
 
     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path.
+
+        Args:
+            file_path: The path to the Outlook Message file.
+        """
+
         self.file_path = file_path
 
langchain/document_loaders/embaas.py
@@ -52,7 +52,10 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
 
 
 class BaseEmbaasLoader(BaseModel):
+    """Base class for embedding a model into an Embaas document extraction API."""
+
     embaas_api_key: Optional[str] = None
+    """The API key for the embaas document extraction API."""
     api_url: str = EMBAAS_DOC_API_URL
     """The URL of the embaas document extraction API."""
     params: EmbaasDocumentExtractionParameters = EmbaasDocumentExtractionParameters()
@@ -69,7 +72,7 @@ class BaseEmbaasLoader(BaseModel):
 
 
 class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
-    """Wrapper around embaas's document byte loader service.
+    """Embaas's document byte loader.
 
     To use, you should have the
     environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
@@ -99,6 +102,11 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
     """
 
     def lazy_parse(self, blob: Blob) -> Iterator[Document]:
+        """Parses the blob lazily.
+
+        Args:
+            blob: The blob to parse.
+        """
         yield from self._get_documents(blob=blob)
 
     @staticmethod
@@ -170,7 +178,7 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
 
 
 class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
-    """Wrapper around embaas's document loader service.
+    """Embaas's document loader.
 
     To use, you should have the
     environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
langchain/document_loaders/evernote.py
@@ -14,6 +14,7 @@ from langchain.document_loaders.base import BaseLoader
 
 class EverNoteLoader(BaseLoader):
     """EverNote Loader.
+
     Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
     Instructions on producing this file can be found at
     https://help.evernote.com/hc/en-us/articles/209005557-Export-notes-and-notebooks-as-ENEX-or-HTML
langchain/document_loaders/excel.py
@@ -13,6 +13,14 @@ class UnstructuredExcelLoader(UnstructuredFileLoader):
     def __init__(
         self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
     ):
+        """
+
+        Args:
+            file_path: The path to the Microsoft Excel file.
+            mode: The mode to use when partitioning the file. See unstructured docs
+                for more info. Optional. Defaults to "single".
+            **unstructured_kwargs: Keyword arguments to pass to unstructured.
+        """
         validate_unstructured_version(min_unstructured_version="0.6.7")
         super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
 
langchain/document_loaders/facebook_chat.py
@@ -9,7 +9,11 @@ from langchain.document_loaders.base import BaseLoader
 
 
 def concatenate_rows(row: dict) -> str:
-    """Combine message information in a readable format ready to be used."""
+    """Combine message information in a readable format ready to be used.
+
+    Args:
+        row: dictionary containing message information.
+    """
     sender = row["sender_name"]
     text = row["content"]
     date = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000).strftime(
@@ -19,10 +23,10 @@ def concatenate_rows(row: dict) -> str:
 
 
 class FacebookChatLoader(BaseLoader):
-    """Loader that loads Facebook messages json directory dump."""
+    """Loads Facebook messages json directory dump."""
 
     def __init__(self, path: str):
-        """Initialize with path."""
+        """Initialize with a path."""
         self.file_path = path
 
     def load(self) -> List[Document]:
langchain/document_loaders/figma.py
@@ -9,10 +9,16 @@ from langchain.utils import stringify_dict
 
 
 class FigmaFileLoader(BaseLoader):
-    """Loader that loads Figma file json."""
+    """Loads Figma file json."""
 
     def __init__(self, access_token: str, ids: str, key: str):
-        """Initialize with access token, ids, and key."""
+        """Initialize with access token, ids, and key.
+
+        Args:
+            access_token: The access token for the Figma REST API.
+            ids: The ids of the Figma file.
+            key: The key for the Figma file
+        """
         self.access_token = access_token
         self.ids = ids
         self.key = key
langchain/document_loaders/gcs_directory.py
@@ -7,10 +7,16 @@ from langchain.document_loaders.gcs_file import GCSFileLoader
 
 
 class GCSDirectoryLoader(BaseLoader):
-    """Loading logic for loading documents from GCS."""
+    """Loads Documents from GCS."""
 
     def __init__(self, project_name: str, bucket: str, prefix: str = ""):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            project_name: The name of the project for the GCS bucket.
+            bucket: The name of the GCS bucket.
+            prefix: The prefix of the GCS bucket.
+        """
         self.project_name = project_name
         self.bucket = bucket
         self.prefix = prefix
@@ -20,7 +26,7 @@ class GCSDirectoryLoader(BaseLoader):
         try:
             from google.cloud import storage
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "Could not import google-cloud-storage python package. "
                 "Please install it with `pip install google-cloud-storage`."
             )
langchain/document_loaders/gcs_file.py
@@ -1,4 +1,4 @@
-"""Loading logic for loading documents from a GCS file."""
+"""Load documents from a GCS file."""
 import os
 import tempfile
 from typing import List
@@ -9,10 +9,16 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
 
 class GCSFileLoader(BaseLoader):
-    """Loading logic for loading documents from GCS."""
+    """Load Documents from a GCS file."""
 
     def __init__(self, project_name: str, bucket: str, blob: str):
-        """Initialize with bucket and key name."""
+        """Initialize with bucket and key name.
+
+        Args:
+            project_name: The name of the project to load
+            bucket: The name of the GCS bucket.
+            blob: The name of the GCS blob to load.
+        """
         self.bucket = bucket
         self.blob = blob
         self.project_name = project_name
@@ -22,7 +28,7 @@ class GCSFileLoader(BaseLoader):
         try:
             from google.cloud import storage
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "Could not import google-cloud-storage python package. "
                 "Please install it with `pip install google-cloud-storage`."
             )
langchain/document_loaders/git.py
@@ -7,9 +7,9 @@ from langchain.document_loaders.base import BaseLoader
 
 class GitLoader(BaseLoader):
     """Loads files from a Git repository into a list of documents.
-    Repository can be local on disk available at `repo_path`,
+    The Repository can be local on disk available at `repo_path`,
     or remote at `clone_url` that will be cloned to `repo_path`.
-    Currently supports only text files.
+    Currently, supports only text files.
 
     Each document represents one file in the repository. The `path` points to
     the local Git repository, and the `branch` specifies the branch to load
@@ -23,6 +23,15 @@ class GitLoader(BaseLoader):
         branch: Optional[str] = "main",
         file_filter: Optional[Callable[[str], bool]] = None,
     ):
+        """
+
+        Args:
+            repo_path: The path to the Git repository.
+            clone_url: Optional. The URL to clone the repository from.
+            branch: Optional. The branch to load files from. Defaults to `main`.
+            file_filter: Optional. A function that takes a file path and returns
+                a boolean indicating whether to load the file. Defaults to None.
+        """
         self.repo_path = repo_path
         self.clone_url = clone_url
         self.branch = branch
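A usage sketch of the GitLoader arguments documented above (not part of this commit; the local path and filter are hypothetical, and GitPython is required):

    from langchain.document_loaders import GitLoader

    loader = GitLoader(
        repo_path="./example_repo",               # cloned here because clone_url is set
        clone_url="https://github.com/hwchase17/langchain",
        branch="master",                          # overrides the "main" default
        file_filter=lambda p: p.endswith(".py"),  # load only Python files
    )
    docs = loader.load()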
langchain/document_loaders/gitbook.py
@@ -28,7 +28,9 @@ class GitbookLoader(WebBaseLoader):
             load_all_paths: If set to True, all relative paths in the navbar
                 are loaded instead of only `web_page`.
             base_url: If `load_all_paths` is True, the relative paths are
-                appended to this base url. Defaults to `web_page` if not set.
+                appended to this base url. Defaults to `web_page`.
+            content_selector: The CSS selector for the content to load.
+                Defaults to "main".
         """
         self.base_url = base_url or web_page
         if self.base_url.endswith("/"):
langchain/document_loaders/github.py
@@ -35,6 +35,8 @@ class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
 
 
 class GitHubIssuesLoader(BaseGitHubLoader):
+    """Load issues of a GitHub repository."""
+
     include_prs: bool = True
     """If True include Pull Requests in results, otherwise ignore them."""
     milestone: Union[int, Literal["*", "none"], None] = None
@@ -159,6 +161,7 @@ class GitHubIssuesLoader(BaseGitHubLoader):
 
     @property
     def query_params(self) -> str:
+        """Create query parameters for GitHub API."""
         labels = ",".join(self.labels) if self.labels else self.labels
         query_params_dict = {
             "milestone": self.milestone,
@@ -179,4 +182,5 @@ class GitHubIssuesLoader(BaseGitHubLoader):
 
     @property
     def url(self) -> str:
+        """Create URL for GitHub API."""
         return f"https://api.github.com/repos/{self.repo}/issues?{self.query_params}"
langchain/document_loaders/googledrive.py
@@ -22,21 +22,32 @@ SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
 
 
 class GoogleDriveLoader(BaseLoader, BaseModel):
-    """Loader that loads Google Docs from Google Drive."""
+    """Loads Google Docs from Google Drive."""
 
     service_account_key: Path = Path.home() / ".credentials" / "keys.json"
+    """Path to the service account key file."""
     credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
+    """Path to the credentials file."""
     token_path: Path = Path.home() / ".credentials" / "token.json"
+    """Path to the token file."""
     folder_id: Optional[str] = None
+    """The folder id to load from."""
     document_ids: Optional[List[str]] = None
+    """The document ids to load from."""
     file_ids: Optional[List[str]] = None
+    """The file ids to load from."""
     recursive: bool = False
+    """Whether to load recursively. Only applies when folder_id is given."""
     file_types: Optional[Sequence[str]] = None
+    """The file types to load. Only applies when folder_id is given."""
     load_trashed_files: bool = False
+    """Whether to load trashed files. Only applies when folder_id is given."""
     # NOTE(MthwRobinson) - changing the file_loader_cls to type here currently
     # results in pydantic validation errors
     file_loader_cls: Any = None
+    """The file loader class to use."""
     file_loader_kwargs: Dict["str", Any] = {}
+    """The file loader kwargs to use."""
 
     @root_validator
     def validate_inputs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
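A usage sketch of the GoogleDriveLoader attributes documented above (not part of this commit; the folder id is a placeholder and Google credentials are assumed to be configured):

    from langchain.document_loaders import GoogleDriveLoader

    loader = GoogleDriveLoader(
        folder_id="1a2b3c...",             # placeholder Drive folder id
        recursive=True,                    # only applies when folder_id is given
        file_types=["document", "sheet"],  # only applies when folder_id is given
    )
    docs = loader.load()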
langchain/document_loaders/gutenberg.py
@@ -1,4 +1,4 @@
-"""Loader that loads .txt web files."""
+"""Loads .txt web files."""
 from typing import List
 
 from langchain.docstore.document import Document
@@ -9,7 +9,7 @@ class GutenbergLoader(BaseLoader):
     """Loader that uses urllib to load .txt web files."""
 
     def __init__(self, file_path: str):
-        """Initialize with file path."""
+        """Initialize with a file path."""
         if not file_path.startswith("https://www.gutenberg.org"):
             raise ValueError("file path must start with 'https://www.gutenberg.org'")
 
langchain/document_loaders/helpers.py
@@ -5,9 +5,14 @@ from typing import List, NamedTuple, Optional, cast
 
 
 class FileEncoding(NamedTuple):
+    """A file encoding as the NamedTuple."""
+
     encoding: Optional[str]
+    """The encoding of the file."""
     confidence: float
+    """The confidence of the encoding."""
     language: Optional[str]
+    """The language of the file."""
 
 
 def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding]:
@@ -15,6 +20,10 @@ def detect_file_encodings(file_path: str, timeout: int = 5) -> List[FileEncoding
 
     Returns a list of `FileEncoding` tuples with the detected encodings ordered
     by confidence.
+
+    Args:
+        file_path: The path to the file to detect the encoding for.
+        timeout: The timeout in seconds for the encoding detection.
     """
     import chardet
 
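A usage sketch of the helper documented above (not part of this commit; "legacy.txt" is a hypothetical file and the chardet package is required):

    from langchain.document_loaders.helpers import detect_file_encodings

    encodings = detect_file_encodings("legacy.txt", timeout=5)
    best = encodings[0]  # detected encodings are ordered by confidence
    print(best.encoding, best.confidence, best.language)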
langchain/document_loaders/hn.py
@@ -1,4 +1,4 @@
-"""Loader that loads HN."""
+"""Loader that loads Hacker News."""
 from typing import Any, List
 
 from langchain.docstore.document import Document
@@ -11,7 +11,7 @@ class HNLoader(WebBaseLoader):
     def load(self) -> List[Document]:
         """Get important HN webpage information.
 
-        Components are:
+        HN webpage components are:
             - title
             - content
             - source url,
langchain/document_loaders/html_bs.py
@@ -20,11 +20,18 @@ class BSHTMLLoader(BaseLoader):
         get_text_separator: str = "",
     ) -> None:
         """Initialise with path, and optionally, file encoding to use, and any kwargs
-        to pass to the BeautifulSoup object."""
+        to pass to the BeautifulSoup object.
+
+        Args:
+            file_path: The path to the file to load.
+            open_encoding: The encoding to use when opening the file.
+            bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
+            get_text_separator: The separator to use when calling get_text on the soup.
+        """
         try:
             import bs4  # noqa:F401
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "beautifulsoup4 package not found, please install it with "
                 "`pip install beautifulsoup4`"
             )
@@ -37,9 +44,9 @@ class BSHTMLLoader(BaseLoader):
         self.get_text_separator = get_text_separator
 
     def load(self) -> List[Document]:
+        """Load HTML document into document objects."""
         from bs4 import BeautifulSoup
 
-        """Load HTML document into document objects."""
         with open(self.file_path, "r", encoding=self.open_encoding) as f:
             soup = BeautifulSoup(f, **self.bs_kwargs)
 
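A usage sketch of the BSHTMLLoader arguments documented above (not part of this commit; "page.html" is hypothetical and beautifulsoup4 is required):

    from langchain.document_loaders import BSHTMLLoader

    loader = BSHTMLLoader(
        "page.html",                     # hypothetical file
        open_encoding="utf-8",
        bs_kwargs={"features": "lxml"},  # forwarded to BeautifulSoup
        get_text_separator="\n",
    )
    docs = loader.load()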
langchain/document_loaders/hugging_face_dataset.py
@@ -1,4 +1,4 @@
-"""Loader that loads HuggingFace datasets."""
+"""Loads HuggingFace datasets."""
 from typing import Iterator, List, Mapping, Optional, Sequence, Union
 
 from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class HuggingFaceDatasetLoader(BaseLoader):
-    """Loading logic for loading documents from the Hugging Face Hub."""
+    """Load Documents from the Hugging Face Hub."""
 
     def __init__(
         self,
@@ -27,14 +27,15 @@ class HuggingFaceDatasetLoader(BaseLoader):
 
         Args:
             path: Path or name of the dataset.
-            page_content_column: Page content column name.
+            page_content_column: Page content column name. Default is "text".
             name: Name of the dataset configuration.
             data_dir: Data directory of the dataset configuration.
             data_files: Path(s) to source data file(s).
             cache_dir: Directory to read/write data.
             keep_in_memory: Whether to copy the dataset in-memory.
             save_infos: Save the dataset information (checksums/size/splits/...).
-            use_auth_token: Bearer token for remote files on the Datasets Hub.
+                Default is False.
+            use_auth_token: Bearer token for remote files on the Dataset Hub.
             num_proc: Number of processes.
         """
 
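A minimal usage sketch of the loader documented above (not part of this commit; "imdb" is just an example dataset, and the datasets package is required):

    from langchain.document_loaders import HuggingFaceDatasetLoader

    # page_content_column defaults to "text", which matches this dataset.
    loader = HuggingFaceDatasetLoader("imdb", page_content_column="text")
    docs = loader.load()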
langchain/document_loaders/ifixit.py
@@ -22,7 +22,7 @@ class IFixitLoader(BaseLoader):
     """
 
     def __init__(self, web_path: str):
-        """Initialize with web path."""
+        """Initialize with a web path."""
         if not web_path.startswith("https://www.ifixit.com"):
             raise ValueError("web path must start with 'https://www.ifixit.com'")
 
@@ -60,6 +60,16 @@ class IFixitLoader(BaseLoader):
 
     @staticmethod
     def load_suggestions(query: str = "", doc_type: str = "all") -> List[Document]:
+        """Load suggestions.
+
+        Args:
+            query: A query string
+            doc_type: The type of document to search for. Can be one of "all",
+              "device", "guide", "teardown", "answer", "wiki".
+
+        Returns:
+
+        """
         res = requests.get(
             IFIXIT_BASE_URL + "/suggest/" + query + "?doctypes=" + doc_type
         )
@@ -89,6 +99,14 @@ class IFixitLoader(BaseLoader):
     def load_questions_and_answers(
         self, url_override: Optional[str] = None
     ) -> List[Document]:
+        """Load a list of questions and answers.
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+
+        """
         loader = WebBaseLoader(self.web_path if url_override is None else url_override)
         soup = loader.scrape()
 
@@ -125,6 +143,16 @@ class IFixitLoader(BaseLoader):
     def load_device(
         self, url_override: Optional[str] = None, include_guides: bool = True
     ) -> List[Document]:
+        """Loads a device
+
+        Args:
+            url_override: A URL to override the default URL.
+            include_guides: Whether to include guides linked to from the device.
+              Defaults to True.
+
+        Returns:
+
+        """
         documents = []
         if url_override is None:
             url = IFIXIT_BASE_URL + "/wikis/CATEGORY/" + self.id
@@ -153,6 +181,14 @@ class IFixitLoader(BaseLoader):
         return documents
 
     def load_guide(self, url_override: Optional[str] = None) -> List[Document]:
+        """Load a guide
+
+        Args:
+            url_override: A URL to override the default URL.
+
+        Returns: List[Document]
+
+        """
         if url_override is None:
             url = IFIXIT_BASE_URL + "/guides/" + self.id
         else:
langchain/document_loaders/image_captions.py
@@ -1,5 +1,5 @@
-"""
-Loader that loads image captions
+"""Loads image captions.
+
 By default, the loader utilizes the pre-trained BLIP image captioning model.
 https://huggingface.co/Salesforce/blip-image-captioning-base
 
@@ -13,7 +13,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class ImageCaptionLoader(BaseLoader):
-    """Loader that loads the captions of an image"""
+    """Loads the captions of an image"""
 
     def __init__(
         self,
@@ -23,6 +23,11 @@ class ImageCaptionLoader(BaseLoader):
     ):
         """
         Initialize with a list of image paths
+
+        Args:
+            path_images: A list of image paths.
+            blip_processor: The name of the pre-trained BLIP processor.
+            blip_model: The name of the pre-trained BLIP model.
         """
         if isinstance(path_images, str):
             self.image_paths = [path_images]
langchain/document_loaders/imsdb.py
@@ -1,4 +1,4 @@
-"""Loader that loads IMSDb."""
+"""Loads IMSDb."""
 from typing import List
 
 from langchain.docstore.document import Document
@@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
 
 
 class IMSDbLoader(WebBaseLoader):
-    """Loader that loads IMSDb webpages."""
+    """Loads IMSDb webpages."""
 
     def load(self) -> List[Document]:
         """Load webpage."""
langchain/document_loaders/iugu.py
@@ -20,6 +20,12 @@ class IuguLoader(BaseLoader):
     """Loader that fetches data from IUGU."""
 
     def __init__(self, resource: str, api_token: Optional[str] = None) -> None:
+        """Initialize the IUGU resource.
+
+        Args:
+            resource: The name of the resource to fetch.
+            api_token: The IUGU API token to use.
+        """
         self.resource = resource
         api_token = api_token or get_from_env("api_token", "IUGU_API_TOKEN")
         self.headers = {"Authorization": f"Bearer {api_token}"}
langchain/document_loaders/joplin.py
@@ -30,6 +30,14 @@ class JoplinLoader(BaseLoader):
         port: int = 41184,
         host: str = "localhost",
     ) -> None:
+        """
+
+        Args:
+            access_token: The access token to use.
+            port: The port where the Web Clipper service is running. Default is 41184.
+            host: The host where the Web Clipper service is running.
+                Default is localhost.
+        """
         access_token = access_token or get_from_env(
             "access_token", "JOPLIN_ACCESS_TOKEN"
         )
langchain/document_loaders/json_loader.py
@@ -1,4 +1,4 @@
-"""Loader that loads data from JSON."""
+"""Loads data from JSON."""
 import json
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union
@@ -8,8 +8,7 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class JSONLoader(BaseLoader):
-    """Loads a JSON file and references a jq schema provided to load the text into
-    documents.
+    """Loads a JSON file using a jq schema.
 
     Example:
         [{"text": ...}, {"text": ...}, {"text": ...}] -> schema = .[].text
@@ -101,7 +100,7 @@ class JSONLoader(BaseLoader):
         return str(content) if content is not None else ""
 
     def _validate_content_key(self, data: Any) -> None:
-        """Check if content key is valid"""
+        """Check if a content key is valid"""
         sample = data.first()
         if not isinstance(sample, dict):
             raise ValueError(
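A usage sketch of the jq-schema example in the docstring above (not part of this commit; "records.json" is hypothetical and the jq package is required):

    from langchain.document_loaders import JSONLoader

    # For a file like [{"text": "..."}, {"text": "..."}], the schema ".[].text"
    # extracts each element's "text" field into its own Document.
    loader = JSONLoader(file_path="records.json", jq_schema=".[].text")
    docs = loader.load()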
langchain/document_loaders/larksuite.py
@@ -1,4 +1,4 @@
-"""Loader that loads LarkSuite (FeiShu) document json dump."""
+"""Loads LarkSuite (FeiShu) document json dump."""
 import json
 import urllib.request
 from typing import Any, Iterator, List
@@ -8,10 +8,16 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class LarkSuiteDocLoader(BaseLoader):
-    """Loader that loads LarkSuite (FeiShu) document."""
+    """Loads LarkSuite (FeiShu) document."""
 
     def __init__(self, domain: str, access_token: str, document_id: str):
-        """Initialize with domain, access_token (tenant / user), and document_id."""
+        """Initialize with domain, access_token (tenant / user), and document_id.
+
+        Args:
+            domain: The domain to load the LarkSuite.
+            access_token: The access_token to use.
+            document_id: The document_id to load.
+        """
         self.domain = domain
         self.access_token = access_token
         self.document_id = document_id
langchain/document_loaders/markdown.py
@@ -1,4 +1,4 @@
-"""Loader that loads Markdown files."""
+"""Loads Markdown files."""
 from typing import List
 
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
langchain/document_loaders/mastodon.py
@@ -15,7 +15,7 @@ def _dependable_mastodon_import() -> mastodon:
     try:
         import mastodon
     except ImportError:
-        raise ValueError(
+        raise ImportError(
            "Mastodon.py package not found, "
            "please install it with `pip install Mastodon.py`"
        )
@@ -37,11 +37,13 @@ class MastodonTootsLoader(BaseLoader):
 
         Args:
             mastodon_accounts: The list of Mastodon accounts to query.
-            number_toots: How many toots to pull for each account.
+            number_toots: How many toots to pull for each account. Default is 100.
             exclude_replies: Whether to exclude reply toots from the load.
+                Default is False.
             access_token: An access token if toots are loaded as a Mastodon app. Can
                 also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
             api_base_url: A Mastodon API base URL to talk to, if not using the default.
+                Default is "https://mastodon.social".
         """
         mastodon = _dependable_mastodon_import()
         access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")
|
@ -32,12 +32,17 @@ class MWDumpLoader(BaseLoader):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
|
def __init__(self, file_path: str, encoding: Optional[str] = "utf8"):
|
||||||
"""Initialize with file path."""
|
"""Initialize with a file path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: XML local file path
|
||||||
|
encoding: Charset encoding, defaults to "utf8"
|
||||||
|
"""
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load from file path."""
|
"""Load from a file path."""
|
||||||
import mwparserfromhell
|
import mwparserfromhell
|
||||||
import mwxml
|
import mwxml
|
||||||
|
|
||||||
|
langchain/document_loaders/mhtml.py
@@ -1,4 +1,4 @@
-"""Loader to load MHTML files, enriching metadata with page title."""
+"""Load MHTML files, enriching metadata with page title."""
 
 import email
 import logging
@@ -21,11 +21,18 @@ class MHTMLLoader(BaseLoader):
         get_text_separator: str = "",
     ) -> None:
         """Initialise with path, and optionally, file encoding to use, and any kwargs
-        to pass to the BeautifulSoup object."""
+        to pass to the BeautifulSoup object.
+
+        Args:
+            file_path: The path to the file to load.
+            open_encoding: The encoding to use when opening the file.
+            bs_kwargs: soup kwargs to pass to the BeautifulSoup object.
+            get_text_separator: The separator to use when getting text from the soup.
+        """
         try:
             import bs4  # noqa:F401
         except ImportError:
-            raise ValueError(
+            raise ImportError(
                 "beautifulsoup4 package not found, please install it with "
                 "`pip install beautifulsoup4`"
             )