mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-04 12:18:24 +00:00
docstrings document_loaders
3 (#6937)
- Updated docstrings for `document_loaders` - Mass update `"""Loader that loads` to `"""Loads` @baskaryan - please, review
This commit is contained in:
parent
9d13dcd17c
commit
5eec74d9a5
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads acreom vault from a directory."""
|
"""Loads acreom vault from a directory."""
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterator, List
|
from typing import Iterator, List
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads local airbyte json files."""
|
"""Loads local airbyte json files."""
|
||||||
import json
|
import json
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -8,7 +8,7 @@ from langchain.utils import stringify_dict
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteJSONLoader(BaseLoader):
|
class AirbyteJSONLoader(BaseLoader):
|
||||||
"""Loader that loads local airbyte json files."""
|
"""Loads local airbyte json files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
|
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads AZLyrics."""
|
"""Loads AZLyrics."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class AZLyricsLoader(WebBaseLoader):
|
class AZLyricsLoader(WebBaseLoader):
|
||||||
"""Loader that loads AZLyrics webpages."""
|
"""Loads AZLyrics webpages."""
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load webpages into Documents."""
|
"""Load webpages into Documents."""
|
||||||
|
@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class BiliBiliLoader(BaseLoader):
|
class BiliBiliLoader(BaseLoader):
|
||||||
"""Loader that loads bilibili transcripts."""
|
"""Loads bilibili transcripts."""
|
||||||
|
|
||||||
def __init__(self, video_urls: List[str]):
|
def __init__(self, video_urls: List[str]):
|
||||||
"""Initialize with bilibili url.
|
"""Initialize with bilibili url.
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads all documents from a blackboard course."""
|
"""Loads all documents from a blackboard course."""
|
||||||
import contextlib
|
import contextlib
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -1,3 +1,4 @@
|
|||||||
|
"""Load conversations from ChatGPT data export"""
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -31,7 +32,7 @@ class ChatGPTLoader(BaseLoader):
|
|||||||
"""Load conversations from exported ChatGPT data."""
|
"""Load conversations from exported ChatGPT data."""
|
||||||
|
|
||||||
def __init__(self, log_file: str, num_logs: int = -1):
|
def __init__(self, log_file: str, num_logs: int = -1):
|
||||||
"""
|
"""Initialize a class object.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
log_file: Path to the log file
|
log_file: Path to the log file
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads College Confidential."""
|
"""Loads College Confidential."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class CollegeConfidentialLoader(WebBaseLoader):
|
class CollegeConfidentialLoader(WebBaseLoader):
|
||||||
"""Loader that loads College Confidential webpages."""
|
"""Loads College Confidential webpages."""
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load webpages as Documents."""
|
"""Load webpages as Documents."""
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads EPub files."""
|
"""Loads EPub files."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Microsoft Excel files."""
|
"""Loads Microsoft Excel files."""
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Facebook chat json dump."""
|
"""Loads Facebook chat json dump."""
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Figma files json dump."""
|
"""Loads Figma files json dump."""
|
||||||
import json
|
import json
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads GitBook."""
|
"""Loads GitBook."""
|
||||||
from typing import Any, List, Optional
|
from typing import Any, List, Optional
|
||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads data from Google Drive."""
|
"""Loads data from Google Drive."""
|
||||||
|
|
||||||
# Prerequisites:
|
# Prerequisites:
|
||||||
# 1. Create a Google Cloud project
|
# 1. Create a Google Cloud project
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Hacker News."""
|
"""Loads Hacker News."""
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads iFixit data."""
|
"""Loads iFixit data."""
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads image files."""
|
"""Loads image files."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
@ -37,13 +37,13 @@ class MastodonTootsLoader(BaseLoader):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
mastodon_accounts: The list of Mastodon accounts to query.
|
mastodon_accounts: The list of Mastodon accounts to query.
|
||||||
number_toots: How many toots to pull for each account. Default is 100.
|
number_toots: How many toots to pull for each account. Defaults to 100.
|
||||||
exclude_replies: Whether to exclude reply toots from the load.
|
exclude_replies: Whether to exclude reply toots from the load.
|
||||||
Default is False.
|
Defaults to False.
|
||||||
access_token: An access token if toots are loaded as a Mastodon app. Can
|
access_token: An access token if toots are loaded as a Mastodon app. Can
|
||||||
also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
|
also be specified via the environment variables "MASTODON_ACCESS_TOKEN".
|
||||||
api_base_url: A Mastodon API base URL to talk to, if not using the default.
|
api_base_url: A Mastodon API base URL to talk to, if not using the default.
|
||||||
Default is "https://mastodon.social".
|
Defaults to "https://mastodon.social".
|
||||||
"""
|
"""
|
||||||
mastodon = _dependable_mastodon_import()
|
mastodon = _dependable_mastodon_import()
|
||||||
access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")
|
access_token = access_token or os.environ.get("MASTODON_ACCESS_TOKEN")
|
||||||
|
@ -24,10 +24,11 @@ class MHTMLLoader(BaseLoader):
|
|||||||
to pass to the BeautifulSoup object.
|
to pass to the BeautifulSoup object.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
file_path: The path to the file to load.
|
file_path: Path to file to load.
|
||||||
open_encoding: The encoding to use when opening the file.
|
open_encoding: The encoding to use when opening the file.
|
||||||
bs_kwargs: soup kwargs to pass to the BeautifulSoup object.
|
bs_kwargs: Any kwargs to pass to the BeautifulSoup object.
|
||||||
get_text_separator: The separator to use when getting text from the soup.
|
get_text_separator: The separator to use when getting the text
|
||||||
|
from the soup.
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
import bs4 # noqa:F401
|
import bs4 # noqa:F401
|
||||||
|
@ -35,6 +35,16 @@ class ModernTreasuryLoader(BaseLoader):
|
|||||||
organization_id: Optional[str] = None,
|
organization_id: Optional[str] = None,
|
||||||
api_key: Optional[str] = None,
|
api_key: Optional[str] = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""
|
||||||
|
|
||||||
|
Args:
|
||||||
|
resource: The Modern Treasury resource to load.
|
||||||
|
organization_id: The Modern Treasury organization ID. It can also be
|
||||||
|
specified via the environment variable
|
||||||
|
"MODERN_TREASURY_ORGANIZATION_ID".
|
||||||
|
api_key: The Modern Treasury API key. It can also be specified via
|
||||||
|
the environment variable "MODERN_TREASURY_API_KEY".
|
||||||
|
"""
|
||||||
self.resource = resource
|
self.resource = resource
|
||||||
organization_id = organization_id or get_from_env(
|
organization_id = organization_id or get_from_env(
|
||||||
"organization_id", "MODERN_TREASURY_ORGANIZATION_ID"
|
"organization_id", "MODERN_TREASURY_ORGANIZATION_ID"
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads .ipynb notebook files."""
|
"""Loads .ipynb notebook files."""
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
@ -10,7 +10,18 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
def concatenate_cells(
|
def concatenate_cells(
|
||||||
cell: dict, include_outputs: bool, max_output_length: int, traceback: bool
|
cell: dict, include_outputs: bool, max_output_length: int, traceback: bool
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Combine cells information in a readable format ready to be used."""
|
"""Combine cells information in a readable format ready to be used.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
cell: A dictionary describing a single notebook cell (its "cell_type", "source" and "outputs").
|
||||||
|
include_outputs: Whether to include the outputs of the cell.
|
||||||
|
max_output_length: Maximum length of the output to be displayed.
|
||||||
|
traceback: Whether to return a traceback of the error.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A string with the cell information.
|
||||||
|
|
||||||
|
"""
|
||||||
cell_type = cell["cell_type"]
|
cell_type = cell["cell_type"]
|
||||||
source = cell["source"]
|
source = cell["source"]
|
||||||
output = cell["outputs"]
|
output = cell["outputs"]
|
||||||
@ -45,7 +56,7 @@ def concatenate_cells(
|
|||||||
|
|
||||||
|
|
||||||
def remove_newlines(x: Any) -> Any:
|
def remove_newlines(x: Any) -> Any:
|
||||||
"""Remove recursively newlines, no matter the data structure they are stored in."""
|
"""Recursively removes newlines, no matter the data structure they are stored in."""
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
if isinstance(x, str):
|
if isinstance(x, str):
|
||||||
@ -59,7 +70,7 @@ def remove_newlines(x: Any) -> Any:
|
|||||||
|
|
||||||
|
|
||||||
class NotebookLoader(BaseLoader):
|
class NotebookLoader(BaseLoader):
|
||||||
"""Loader that loads .ipynb notebook files."""
|
"""Loads .ipynb notebook files."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -69,7 +80,19 @@ class NotebookLoader(BaseLoader):
|
|||||||
remove_newline: bool = False,
|
remove_newline: bool = False,
|
||||||
traceback: bool = False,
|
traceback: bool = False,
|
||||||
):
|
):
|
||||||
"""Initialize with path."""
|
"""Initialize with path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path: The path to load the notebook from.
|
||||||
|
include_outputs: Whether to include the outputs of the cell.
|
||||||
|
Defaults to False.
|
||||||
|
max_output_length: Maximum length of the output to be displayed.
|
||||||
|
Defaults to 10.
|
||||||
|
remove_newline: Whether to remove newlines from the notebook.
|
||||||
|
Defaults to False.
|
||||||
|
traceback: Whether to return a traceback of the error.
|
||||||
|
Defaults to False.
|
||||||
|
"""
|
||||||
self.file_path = path
|
self.file_path = path
|
||||||
self.include_outputs = include_outputs
|
self.include_outputs = include_outputs
|
||||||
self.max_output_length = max_output_length
|
self.max_output_length = max_output_length
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Notion directory dump."""
|
"""Loads Notion directory dump."""
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -7,10 +7,10 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class NotionDirectoryLoader(BaseLoader):
|
class NotionDirectoryLoader(BaseLoader):
|
||||||
"""Loader that loads Notion directory dump."""
|
"""Loads Notion directory dump."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
"""Initialize with path."""
|
"""Initialize with a file path."""
|
||||||
self.file_path = path
|
self.file_path = path
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
|
@ -15,11 +15,12 @@ BLOCK_URL = NOTION_BASE_URL + "/blocks/{block_id}/children"
|
|||||||
|
|
||||||
class NotionDBLoader(BaseLoader):
|
class NotionDBLoader(BaseLoader):
|
||||||
"""Notion DB Loader.
|
"""Notion DB Loader.
|
||||||
Reads content from pages within a Noton Database.
|
Reads content from pages within a Notion Database.
|
||||||
Args:
|
Args:
|
||||||
integration_token (str): Notion integration token.
|
integration_token (str): Notion integration token.
|
||||||
database_id (str): Notion database id.
|
database_id (str): Notion database id.
|
||||||
request_timeout_sec (int): Timeout for Notion requests in seconds.
|
request_timeout_sec (int): Timeout for Notion requests in seconds.
|
||||||
|
Defaults to 10.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -75,7 +76,11 @@ class NotionDBLoader(BaseLoader):
|
|||||||
return pages
|
return pages
|
||||||
|
|
||||||
def load_page(self, page_summary: Dict[str, Any]) -> Document:
|
def load_page(self, page_summary: Dict[str, Any]) -> Document:
|
||||||
"""Read a page."""
|
"""Read a page.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
page_summary: Page summary from Notion API.
|
||||||
|
"""
|
||||||
page_id = page_summary["id"]
|
page_id = page_summary["id"]
|
||||||
|
|
||||||
# load properties as metadata
|
# load properties as metadata
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Obsidian directory dump."""
|
"""Loads Obsidian directory dump."""
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -8,14 +8,21 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class ObsidianLoader(BaseLoader):
|
class ObsidianLoader(BaseLoader):
|
||||||
"""Loader that loads Obsidian files from disk."""
|
"""Loads Obsidian files from disk."""
|
||||||
|
|
||||||
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
|
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
|
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
|
||||||
):
|
):
|
||||||
"""Initialize with path."""
|
"""Initialize with a path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
path: Path to the directory containing the Obsidian files.
|
||||||
|
encoding: Charset encoding. Defaults to "UTF-8".
|
||||||
|
collect_metadata: Whether to collect metadata from the front matter.
|
||||||
|
Defaults to True.
|
||||||
|
"""
|
||||||
self.file_path = path
|
self.file_path = path
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
self.collect_metadata = collect_metadata
|
self.collect_metadata = collect_metadata
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Open Office ODT files."""
|
"""Loads OpenOffice ODT files."""
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
@ -8,11 +8,19 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredODTLoader(UnstructuredFileLoader):
|
class UnstructuredODTLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load open office ODT files."""
|
"""Loader that uses unstructured to load OpenOffice ODT files."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: The path to the file to load.
|
||||||
|
mode: The mode to use when loading the file. Can be one of "single",
|
||||||
|
"multi", or "all". Default is "single".
|
||||||
|
**unstructured_kwargs: Any additional keyword arguments to pass on to unstructured.
|
||||||
|
"""
|
||||||
validate_unstructured_version(min_unstructured_version="0.6.3")
|
validate_unstructured_version(min_unstructured_version="0.6.3")
|
||||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads data from OneDrive"""
|
"""Loads data from OneDrive"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
@ -60,11 +60,18 @@ class _SupportedFileTypes(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class OneDriveLoader(BaseLoader, BaseModel):
|
class OneDriveLoader(BaseLoader, BaseModel):
|
||||||
|
"""Loads data from OneDrive."""
|
||||||
|
|
||||||
settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
|
settings: _OneDriveSettings = Field(default_factory=_OneDriveSettings)
|
||||||
|
""" The settings for the OneDrive API client."""
|
||||||
drive_id: str = Field(...)
|
drive_id: str = Field(...)
|
||||||
|
""" The ID of the OneDrive drive to load data from."""
|
||||||
folder_path: Optional[str] = None
|
folder_path: Optional[str] = None
|
||||||
|
""" The path to the folder to load data from."""
|
||||||
object_ids: Optional[List[str]] = None
|
object_ids: Optional[List[str]] = None
|
||||||
|
""" The IDs of the objects to load data from."""
|
||||||
auth_with_token: bool = False
|
auth_with_token: bool = False
|
||||||
|
""" Whether to authenticate with a token or not. Defaults to False."""
|
||||||
|
|
||||||
def _auth(self) -> Type[Account]:
|
def _auth(self) -> Type[Account]:
|
||||||
"""
|
"""
|
||||||
|
@ -16,10 +16,15 @@ CHUNK_SIZE = 1024 * 1024 * 5
|
|||||||
|
|
||||||
|
|
||||||
class OneDriveFileLoader(BaseLoader, BaseModel):
|
class OneDriveFileLoader(BaseLoader, BaseModel):
|
||||||
|
"""Loads a file from OneDrive."""
|
||||||
|
|
||||||
file: File = Field(...)
|
file: File = Field(...)
|
||||||
|
"""The file to load."""
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
"""Allow arbitrary types. This is needed for the File type, so it is explicitly set to True here.
|
||||||
|
See https://pydantic-docs.helpmanual.io/usage/types/#arbitrary-types-allowed"""
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load Documents"""
|
"""Load Documents"""
|
||||||
|
@ -5,13 +5,19 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class OpenCityDataLoader(BaseLoader):
|
class OpenCityDataLoader(BaseLoader):
|
||||||
"""Loader that loads Open city data."""
|
"""Loads Open City data."""
|
||||||
|
|
||||||
def __init__(self, city_id: str, dataset_id: str, limit: int):
|
def __init__(self, city_id: str, dataset_id: str, limit: int):
|
||||||
"""Initialize with dataset_id"""
|
"""Initialize with dataset_id.
|
||||||
""" Example: https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6 """
|
Example: https://dev.socrata.com/foundry/data.sfgov.org/vw6y-z8j6
|
||||||
""" e.g., city_id = data.sfgov.org """
|
e.g., city_id = data.sfgov.org
|
||||||
""" e.g., dataset_id = vw6y-z8j6 """
|
e.g., dataset_id = vw6y-z8j6
|
||||||
|
|
||||||
|
Args:
|
||||||
|
city_id: The Open City city identifier.
|
||||||
|
dataset_id: The Open City dataset identifier.
|
||||||
|
limit: The maximum number of documents to load.
|
||||||
|
"""
|
||||||
self.city_id = city_id
|
self.city_id = city_id
|
||||||
self.dataset_id = dataset_id
|
self.dataset_id = dataset_id
|
||||||
self.limit = limit
|
self.limit = limit
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Org-Mode files."""
|
"""Loads Org-Mode files."""
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
@ -13,6 +13,14 @@ class UnstructuredOrgModeLoader(UnstructuredFileLoader):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: The path to the file to load.
|
||||||
|
mode: The mode to load the file from. Default is "single".
|
||||||
|
**unstructured_kwargs: Any additional keyword arguments to pass
|
||||||
|
on to unstructured.
|
||||||
|
"""
|
||||||
validate_unstructured_version(min_unstructured_version="0.7.9")
|
validate_unstructured_version(min_unstructured_version="0.7.9")
|
||||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads PDF files."""
|
"""Loads PDF files."""
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@ -41,11 +41,11 @@ class BasePDFLoader(BaseLoader, ABC):
|
|||||||
"""Base loader class for PDF files.
|
"""Base loader class for PDF files.
|
||||||
|
|
||||||
Defaults to check for local file, but if the file is a web path, it will download it
|
Defaults to check for local file, but if the file is a web path, it will download it
|
||||||
to a temporary file, and use that, then clean up the temporary file after completion
|
to a temporary file, use it, then clean up the temporary file after completion
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with a file path."""
|
||||||
self.file_path = file_path
|
self.file_path = file_path
|
||||||
self.web_path = None
|
self.web_path = None
|
||||||
if "~" in self.file_path:
|
if "~" in self.file_path:
|
||||||
@ -86,7 +86,7 @@ class BasePDFLoader(BaseLoader, ABC):
|
|||||||
|
|
||||||
|
|
||||||
class OnlinePDFLoader(BasePDFLoader):
|
class OnlinePDFLoader(BasePDFLoader):
|
||||||
"""Loader that loads online PDFs."""
|
"""Loads online PDFs."""
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load documents."""
|
"""Load documents."""
|
||||||
@ -97,13 +97,13 @@ class OnlinePDFLoader(BasePDFLoader):
|
|||||||
class PyPDFLoader(BasePDFLoader):
|
class PyPDFLoader(BasePDFLoader):
|
||||||
"""Loads a PDF with pypdf and chunks at character level.
|
"""Loads a PDF with pypdf and chunks at character level.
|
||||||
|
|
||||||
Loader also stores page numbers in metadatas.
|
Loader also stores page numbers in metadata.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, file_path: str, password: Optional[Union[str, bytes]] = None
|
self, file_path: str, password: Optional[Union[str, bytes]] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize with file path."""
|
"""Initialize with a file path."""
|
||||||
try:
|
try:
|
||||||
import pypdf # noqa:F401
|
import pypdf # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -129,7 +129,7 @@ class PyPDFium2Loader(BasePDFLoader):
|
|||||||
"""Loads a PDF with pypdfium2 and chunks at character level."""
|
"""Loads a PDF with pypdfium2 and chunks at character level."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with a file path."""
|
||||||
super().__init__(file_path)
|
super().__init__(file_path)
|
||||||
self.parser = PyPDFium2Parser()
|
self.parser = PyPDFium2Parser()
|
||||||
|
|
||||||
@ -148,7 +148,7 @@ class PyPDFium2Loader(BasePDFLoader):
|
|||||||
class PyPDFDirectoryLoader(BaseLoader):
|
class PyPDFDirectoryLoader(BaseLoader):
|
||||||
"""Loads a directory with PDF files with pypdf and chunks at character level.
|
"""Loads a directory with PDF files with pypdf and chunks at character level.
|
||||||
|
|
||||||
Loader also stores page numbers in metadatas.
|
Loader also stores page numbers in metadata.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
@ -222,7 +222,7 @@ class PDFMinerPDFasHTMLLoader(BasePDFLoader):
|
|||||||
"""Loader that uses PDFMiner to load PDF files as HTML content."""
|
"""Loader that uses PDFMiner to load PDF files as HTML content."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with a file path."""
|
||||||
try:
|
try:
|
||||||
from pdfminer.high_level import extract_text_to_fp # noqa:F401
|
from pdfminer.high_level import extract_text_to_fp # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -256,7 +256,7 @@ class PyMuPDFLoader(BasePDFLoader):
|
|||||||
"""Loader that uses PyMuPDF to load PDF files."""
|
"""Loader that uses PyMuPDF to load PDF files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str) -> None:
|
def __init__(self, file_path: str) -> None:
|
||||||
"""Initialize with file path."""
|
"""Initialize with a file path."""
|
||||||
try:
|
try:
|
||||||
import fitz # noqa:F401
|
import fitz # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -278,6 +278,8 @@ class PyMuPDFLoader(BasePDFLoader):
|
|||||||
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
|
# MathpixPDFLoader implementation taken largely from Daniel Gross's:
|
||||||
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
|
# https://gist.github.com/danielgross/3ab4104e14faccc12b49200843adab21
|
||||||
class MathpixPDFLoader(BasePDFLoader):
|
class MathpixPDFLoader(BasePDFLoader):
|
||||||
|
"""This class uses Mathpix service to load PDF files."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
file_path: str,
|
file_path: str,
|
||||||
@ -286,6 +288,16 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
should_clean_pdf: bool = False,
|
should_clean_pdf: bool = False,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> None:
|
) -> None:
|
||||||
|
"""Initialize with a file path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: a file for loading.
|
||||||
|
processed_file_format: a format of the processed file. Default is "mmd".
|
||||||
|
max_wait_time_seconds: a maximum time to wait for the response from
|
||||||
|
the server. Default is 500.
|
||||||
|
should_clean_pdf: a flag to clean the PDF file. Default is False.
|
||||||
|
**kwargs: additional keyword arguments.
|
||||||
|
"""
|
||||||
super().__init__(file_path)
|
super().__init__(file_path)
|
||||||
self.mathpix_api_key = get_from_dict_or_env(
|
self.mathpix_api_key = get_from_dict_or_env(
|
||||||
kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
|
kwargs, "mathpix_api_key", "MATHPIX_API_KEY"
|
||||||
@ -324,6 +336,13 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
raise ValueError("Unable to send PDF to Mathpix.")
|
raise ValueError("Unable to send PDF to Mathpix.")
|
||||||
|
|
||||||
def wait_for_processing(self, pdf_id: str) -> None:
|
def wait_for_processing(self, pdf_id: str) -> None:
|
||||||
|
"""Wait for processing to complete.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
pdf_id: a PDF id.
|
||||||
|
|
||||||
|
Returns: None
|
||||||
|
"""
|
||||||
url = self.url + "/" + pdf_id
|
url = self.url + "/" + pdf_id
|
||||||
for _ in range(0, self.max_wait_time_seconds, 5):
|
for _ in range(0, self.max_wait_time_seconds, 5):
|
||||||
response = requests.get(url, headers=self.headers)
|
response = requests.get(url, headers=self.headers)
|
||||||
@ -346,6 +365,14 @@ class MathpixPDFLoader(BasePDFLoader):
|
|||||||
return response.content.decode("utf-8")
|
return response.content.decode("utf-8")
|
||||||
|
|
||||||
def clean_pdf(self, contents: str) -> str:
|
def clean_pdf(self, contents: str) -> str:
|
||||||
|
"""Clean the PDF file.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
contents: a PDF file contents.
|
||||||
|
|
||||||
|
Returns: The cleaned contents as a string.
|
||||||
|
|
||||||
|
"""
|
||||||
contents = "\n".join(
|
contents = "\n".join(
|
||||||
[line for line in contents.split("\n") if not line.startswith("![]")]
|
[line for line in contents.split("\n") if not line.startswith("![]")]
|
||||||
)
|
)
|
||||||
@ -375,7 +402,7 @@ class PDFPlumberLoader(BasePDFLoader):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
|
self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Initialize with file path."""
|
"""Initialize with a file path."""
|
||||||
try:
|
try:
|
||||||
import pdfplumber # noqa:F401
|
import pdfplumber # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads powerpoint files."""
|
"""Loads PowerPoint files."""
|
||||||
import os
|
import os
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -6,7 +6,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
class UnstructuredPowerPointLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load powerpoint files."""
|
"""Loader that uses unstructured to load PowerPoint files."""
|
||||||
|
|
||||||
def _get_elements(self) -> List:
|
def _get_elements(self) -> List:
|
||||||
from unstructured.__version__ import __version__ as __unstructured_version__
|
from unstructured.__version__ import __version__ as __unstructured_version__
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads documents from Psychic.dev."""
|
"""Loads documents from Psychic.dev."""
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,12 +6,18 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class PsychicLoader(BaseLoader):
|
class PsychicLoader(BaseLoader):
|
||||||
"""Loader that loads documents from Psychic.dev."""
|
"""Loads documents from Psychic.dev."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, api_key: str, account_id: str, connector_id: Optional[str] = None
|
self, api_key: str, account_id: str, connector_id: Optional[str] = None
|
||||||
):
|
):
|
||||||
"""Initialize with API key, connector id, and account id."""
|
"""Initialize with API key, connector id, and account id.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
api_key: The Psychic API key.
|
||||||
|
account_id: The Psychic account id.
|
||||||
|
connector_id: The Psychic connector id.
|
||||||
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from psychicapi import ConnectorId, Psychic # noqa: F401
|
from psychicapi import ConnectorId, Psychic # noqa: F401
|
||||||
|
@ -23,7 +23,15 @@ class PySparkDataFrameLoader(BaseLoader):
|
|||||||
page_content_column: str = "text",
|
page_content_column: str = "text",
|
||||||
fraction_of_memory: float = 0.1,
|
fraction_of_memory: float = 0.1,
|
||||||
):
|
):
|
||||||
"""Initialize with a Spark DataFrame object."""
|
"""Initialize with a Spark DataFrame object.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
spark_session: The SparkSession object.
|
||||||
|
df: The Spark DataFrame object.
|
||||||
|
page_content_column: The name of the column containing the page content.
|
||||||
|
Defaults to "text".
|
||||||
|
fraction_of_memory: The fraction of memory to use. Defaults to 0.1.
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
from pyspark.sql import DataFrame, SparkSession
|
from pyspark.sql import DataFrame, SparkSession
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -48,7 +56,7 @@ class PySparkDataFrameLoader(BaseLoader):
|
|||||||
self.column_names = self.df.columns
|
self.column_names = self.df.columns
|
||||||
|
|
||||||
def get_num_rows(self) -> Tuple[int, int]:
|
def get_num_rows(self) -> Tuple[int, int]:
|
||||||
"""Gets the amount of "feasible" rows for the DataFrame"""
|
"""Gets the number of "feasible" rows for the DataFrame"""
|
||||||
try:
|
try:
|
||||||
import psutil
|
import psutil
|
||||||
except ImportError as e:
|
except ImportError as e:
|
||||||
|
@ -9,6 +9,11 @@ class PythonLoader(TextLoader):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
|
"""Initialize with a file path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: The path to the file to load.
|
||||||
|
"""
|
||||||
with open(file_path, "rb") as f:
|
with open(file_path, "rb") as f:
|
||||||
encoding, _ = tokenize.detect_encoding(f.readline)
|
encoding, _ = tokenize.detect_encoding(f.readline)
|
||||||
super().__init__(file_path=file_path, encoding=encoding)
|
super().__init__(file_path=file_path, encoding=encoding)
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads ReadTheDocs documentation directory dump."""
|
"""Loads ReadTheDocs documentation directory dump."""
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, List, Optional, Tuple, Union
|
from typing import Any, List, Optional, Tuple, Union
|
||||||
|
|
||||||
@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class ReadTheDocsLoader(BaseLoader):
|
class ReadTheDocsLoader(BaseLoader):
|
||||||
"""Loader that loads ReadTheDocs documentation directory dump."""
|
"""Loads ReadTheDocs documentation directory dump."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -20,7 +20,7 @@ class ReadTheDocsLoader(BaseLoader):
|
|||||||
"""
|
"""
|
||||||
Initialize ReadTheDocsLoader
|
Initialize ReadTheDocsLoader
|
||||||
|
|
||||||
The loader loops over all files under `path` and extract the actual content of
|
The loader loops over all files under `path` and extracts the actual content of
|
||||||
the files by retrieving main html tags. Default main html tags include
|
the files by retrieving main html tags. Default main html tags include
|
||||||
`<main id="main-content">`, `<div role="main">`, and `<article role="main">`. You
|
`<main id="main-content">`, `<div role="main">`, and `<article role="main">`. You
|
||||||
can also define your own html tags by passing custom_html_tag, e.g.
|
can also define your own html tags by passing custom_html_tag, e.g.
|
||||||
@ -31,7 +31,7 @@ class ReadTheDocsLoader(BaseLoader):
|
|||||||
Args:
|
Args:
|
||||||
path: The location of pulled readthedocs folder.
|
path: The location of pulled readthedocs folder.
|
||||||
encoding: The encoding with which to open the documents.
|
encoding: The encoding with which to open the documents.
|
||||||
errors: Specifies how encoding and decoding errors are to be handled—this
|
errors: Specify how encoding and decoding errors are to be handled—this
|
||||||
cannot be used in binary mode.
|
cannot be used in binary mode.
|
||||||
custom_html_tag: Optional custom html tag to retrieve the content from
|
custom_html_tag: Optional custom html tag to retrieve the content from
|
||||||
files.
|
files.
|
||||||
|
@ -8,17 +8,27 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class RecursiveUrlLoader(BaseLoader):
|
class RecursiveUrlLoader(BaseLoader):
|
||||||
"""Loader that loads all child links from a given url."""
|
"""Loads all child links from a given url."""
|
||||||
|
|
||||||
def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None:
|
def __init__(self, url: str, exclude_dirs: Optional[str] = None) -> None:
|
||||||
"""Initialize with URL to crawl and any sub-directories to exclude."""
|
"""Initialize with URL to crawl and any subdirectories to exclude.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL to crawl.
|
||||||
|
exclude_dirs: A list of subdirectories to exclude.
|
||||||
|
"""
|
||||||
self.url = url
|
self.url = url
|
||||||
self.exclude_dirs = exclude_dirs
|
self.exclude_dirs = exclude_dirs
|
||||||
|
|
||||||
def get_child_links_recursive(
|
def get_child_links_recursive(
|
||||||
self, url: str, visited: Optional[Set[str]] = None
|
self, url: str, visited: Optional[Set[str]] = None
|
||||||
) -> Set[str]:
|
) -> Set[str]:
|
||||||
"""Recursively get all child links starting with the path of the input URL."""
|
"""Recursively get all child links starting with the path of the input URL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL to crawl.
|
||||||
|
visited: A set of visited URLs.
|
||||||
|
"""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
@ -39,7 +49,7 @@ class RecursiveUrlLoader(BaseLoader):
|
|||||||
if not parent_url.endswith("/"):
|
if not parent_url.endswith("/"):
|
||||||
parent_url += "/"
|
parent_url += "/"
|
||||||
|
|
||||||
# Exclude the root and parent from list
|
# Exclude the root and parent from the list
|
||||||
visited = set() if visited is None else visited
|
visited = set() if visited is None else visited
|
||||||
|
|
||||||
# Exclude the links that start with any of the excluded directories
|
# Exclude the links that start with any of the excluded directories
|
||||||
|
@ -23,7 +23,7 @@ def _dependable_praw_import() -> praw:
|
|||||||
class RedditPostsLoader(BaseLoader):
|
class RedditPostsLoader(BaseLoader):
|
||||||
"""Reddit posts loader.
|
"""Reddit posts loader.
|
||||||
Read posts on a subreddit.
|
Read posts on a subreddit.
|
||||||
First you need to go to
|
First, you need to go to
|
||||||
https://www.reddit.com/prefs/apps/
|
https://www.reddit.com/prefs/apps/
|
||||||
and create your application
|
and create your application
|
||||||
"""
|
"""
|
||||||
@ -38,6 +38,20 @@ class RedditPostsLoader(BaseLoader):
|
|||||||
categories: Sequence[str] = ["new"],
|
categories: Sequence[str] = ["new"],
|
||||||
number_posts: Optional[int] = 10,
|
number_posts: Optional[int] = 10,
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Initialize with client_id, client_secret, user_agent, search_queries, mode,
|
||||||
|
categories, number_posts.
|
||||||
|
Example: https://www.reddit.com/r/learnpython/
|
||||||
|
|
||||||
|
Args:
|
||||||
|
client_id: Reddit client id.
|
||||||
|
client_secret: Reddit client secret.
|
||||||
|
user_agent: Reddit user agent.
|
||||||
|
search_queries: The search queries.
|
||||||
|
mode: The mode.
|
||||||
|
categories: The categories. Default: ["new"]
|
||||||
|
number_posts: The number of posts. Default: 10
|
||||||
|
"""
|
||||||
self.client_id = client_id
|
self.client_id = client_id
|
||||||
self.client_secret = client_secret
|
self.client_secret = client_secret
|
||||||
self.user_agent = user_agent
|
self.user_agent = user_agent
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Roam directory dump."""
|
"""Loads Roam directory dump."""
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -7,10 +7,10 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class RoamLoader(BaseLoader):
|
class RoamLoader(BaseLoader):
|
||||||
"""Loader that loads Roam files from disk."""
|
"""Loads Roam files from disk."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
"""Initialize with path."""
|
"""Initialize with a path."""
|
||||||
self.file_path = path
|
self.file_path = path
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads RST files."""
|
"""Loads RST files."""
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
@ -13,6 +13,16 @@ class UnstructuredRSTLoader(UnstructuredFileLoader):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Initialize with a file path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: The path to the file to load.
|
||||||
|
mode: The mode to use for partitioning. See unstructured for details.
|
||||||
|
Defaults to "single".
|
||||||
|
**unstructured_kwargs: Additional keyword arguments to pass
|
||||||
|
to unstructured.
|
||||||
|
"""
|
||||||
validate_unstructured_version(min_unstructured_version="0.7.5")
|
validate_unstructured_version(min_unstructured_version="0.7.5")
|
||||||
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
super().__init__(file_path=file_path, mode=mode, **unstructured_kwargs)
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads rich text files."""
|
"""Loads rich text files."""
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
@ -13,6 +13,16 @@ class UnstructuredRTFLoader(UnstructuredFileLoader):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
self, file_path: str, mode: str = "single", **unstructured_kwargs: Any
|
||||||
):
|
):
|
||||||
|
"""
|
||||||
|
Initialize with a file path.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
file_path: The path to the file to load.
|
||||||
|
mode: The mode to use for partitioning. See unstructured for details.
|
||||||
|
Defaults to "single".
|
||||||
|
**unstructured_kwargs: Additional keyword arguments to pass
|
||||||
|
to unstructured.
|
||||||
|
"""
|
||||||
min_unstructured_version = "0.5.12"
|
min_unstructured_version = "0.5.12"
|
||||||
if not satisfies_min_unstructured_version(min_unstructured_version):
|
if not satisfies_min_unstructured_version(min_unstructured_version):
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loading logic for loading documents from an s3 directory."""
|
"""Loading logic for loading documents from an AWS S3 directory."""
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -7,10 +7,15 @@ from langchain.document_loaders.s3_file import S3FileLoader
|
|||||||
|
|
||||||
|
|
||||||
class S3DirectoryLoader(BaseLoader):
|
class S3DirectoryLoader(BaseLoader):
|
||||||
"""Loading logic for loading documents from s3."""
|
"""Loading logic for loading documents from an AWS S3."""
|
||||||
|
|
||||||
def __init__(self, bucket: str, prefix: str = ""):
|
def __init__(self, bucket: str, prefix: str = ""):
|
||||||
"""Initialize with bucket and key name."""
|
"""Initialize with bucket and key name.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
bucket: The name of the S3 bucket.
|
||||||
|
prefix: The prefix of the S3 key. Defaults to "".
|
||||||
|
"""
|
||||||
self.bucket = bucket
|
self.bucket = bucket
|
||||||
self.prefix = prefix
|
self.prefix = prefix
|
||||||
|
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loading logic for loading documents from an s3 file."""
|
"""Loading logic for loading documents from an AWS S3 file."""
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -9,10 +9,15 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class S3FileLoader(BaseLoader):
|
class S3FileLoader(BaseLoader):
|
||||||
"""Loading logic for loading documents from s3."""
|
"""Loading logic for loading documents from an AWS S3 file."""
|
||||||
|
|
||||||
def __init__(self, bucket: str, key: str):
|
def __init__(self, bucket: str, key: str):
|
||||||
"""Initialize with bucket and key name."""
|
"""Initialize with bucket and key name.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
bucket: The name of the S3 bucket.
|
||||||
|
key: The key of the S3 object.
|
||||||
|
"""
|
||||||
self.bucket = bucket
|
self.bucket = bucket
|
||||||
self.key = key
|
self.key = key
|
||||||
|
|
||||||
|
@ -42,11 +42,12 @@ class SitemapLoader(WebBaseLoader):
|
|||||||
urls that are parsed and loaded
|
urls that are parsed and loaded
|
||||||
parsing_function: Function to parse bs4.Soup output
|
parsing_function: Function to parse bs4.Soup output
|
||||||
blocksize: number of sitemap locations per block
|
blocksize: number of sitemap locations per block
|
||||||
blocknum: the number of the block that should be loaded - zero indexed
|
blocknum: the number of the block that should be loaded - zero indexed.
|
||||||
|
Default: 0
|
||||||
meta_function: Function to parse bs4.Soup output for metadata
|
meta_function: Function to parse bs4.Soup output for metadata
|
||||||
remember when setting this method to also copy metadata["loc"]
|
remember when setting this method to also copy metadata["loc"]
|
||||||
to metadata["source"] if you are using this field
|
to metadata["source"] if you are using this field
|
||||||
is_local: whether the sitemap is a local file
|
is_local: whether the sitemap is a local file. Default: False
|
||||||
"""
|
"""
|
||||||
|
|
||||||
if blocksize is not None and blocksize < 1:
|
if blocksize is not None and blocksize < 1:
|
||||||
@ -72,7 +73,14 @@ class SitemapLoader(WebBaseLoader):
|
|||||||
self.is_local = is_local
|
self.is_local = is_local
|
||||||
|
|
||||||
def parse_sitemap(self, soup: Any) -> List[dict]:
|
def parse_sitemap(self, soup: Any) -> List[dict]:
|
||||||
"""Parse sitemap xml and load into a list of dicts."""
|
"""Parse sitemap xml and load into a list of dicts.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
soup: BeautifulSoup object.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of dicts.
|
||||||
|
"""
|
||||||
els = []
|
els = []
|
||||||
for url in soup.find_all("url"):
|
for url in soup.find_all("url"):
|
||||||
loc = url.find("loc")
|
loc = url.find("loc")
|
||||||
|
@ -9,7 +9,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class SlackDirectoryLoader(BaseLoader):
|
class SlackDirectoryLoader(BaseLoader):
|
||||||
"""Loader for loading documents from a Slack directory dump."""
|
"""Loads documents from a Slack directory dump."""
|
||||||
|
|
||||||
def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
|
def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
|
||||||
"""Initialize the SlackDirectoryLoader.
|
"""Initialize the SlackDirectoryLoader.
|
||||||
|
@ -41,6 +41,7 @@ class SnowflakeLoader(BaseLoader):
|
|||||||
role: Snowflake role.
|
role: Snowflake role.
|
||||||
database: Snowflake database
|
database: Snowflake database
|
||||||
schema: Snowflake schema
|
schema: Snowflake schema
|
||||||
|
parameters: Optional. Parameters to pass to the query.
|
||||||
page_content_columns: Optional. Columns written to Document `page_content`.
|
page_content_columns: Optional. Columns written to Document `page_content`.
|
||||||
metadata_columns: Optional. Columns written to Document `metadata`.
|
metadata_columns: Optional. Columns written to Document `metadata`.
|
||||||
"""
|
"""
|
||||||
@ -62,7 +63,7 @@ class SnowflakeLoader(BaseLoader):
|
|||||||
try:
|
try:
|
||||||
import snowflake.connector
|
import snowflake.connector
|
||||||
except ImportError as ex:
|
except ImportError as ex:
|
||||||
raise ValueError(
|
raise ImportError(
|
||||||
"Could not import snowflake-connector-python package. "
|
"Could not import snowflake-connector-python package. "
|
||||||
"Please install it with `pip install snowflake-connector-python`."
|
"Please install it with `pip install snowflake-connector-python`."
|
||||||
) from ex
|
) from ex
|
||||||
|
@ -23,6 +23,12 @@ class SpreedlyLoader(BaseLoader):
|
|||||||
"""Loader that fetches data from Spreedly API."""
|
"""Loader that fetches data from Spreedly API."""
|
||||||
|
|
||||||
def __init__(self, access_token: str, resource: str) -> None:
|
def __init__(self, access_token: str, resource: str) -> None:
|
||||||
|
"""Initialize with an access token and a resource.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
access_token: The access token.
|
||||||
|
resource: The resource.
|
||||||
|
"""
|
||||||
self.access_token = access_token
|
self.access_token = access_token
|
||||||
self.resource = resource
|
self.resource = resource
|
||||||
self.headers = {
|
self.headers = {
|
||||||
|
@ -9,7 +9,7 @@ class SRTLoader(BaseLoader):
|
|||||||
"""Loader for .srt (subtitle) files."""
|
"""Loader for .srt (subtitle) files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with file path."""
|
"""Initialize with a file path."""
|
||||||
try:
|
try:
|
||||||
import pysrt # noqa:F401
|
import pysrt # noqa:F401
|
||||||
except ImportError:
|
except ImportError:
|
||||||
|
@ -21,6 +21,12 @@ class StripeLoader(BaseLoader):
|
|||||||
"""Loader that fetches data from Stripe."""
|
"""Loader that fetches data from Stripe."""
|
||||||
|
|
||||||
def __init__(self, resource: str, access_token: Optional[str] = None) -> None:
|
def __init__(self, resource: str, access_token: Optional[str] = None) -> None:
|
||||||
|
"""Initialize with a resource and an access token.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
resource: The resource.
|
||||||
|
access_token: The access token.
|
||||||
|
"""
|
||||||
self.resource = resource
|
self.resource = resource
|
||||||
access_token = access_token or get_from_env(
|
access_token = access_token or get_from_env(
|
||||||
"access_token", "STRIPE_ACCESS_TOKEN"
|
"access_token", "STRIPE_ACCESS_TOKEN"
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Telegram chat json dump."""
|
"""Loads Telegram chat json dump."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
@ -24,10 +24,10 @@ def concatenate_rows(row: dict) -> str:
|
|||||||
|
|
||||||
|
|
||||||
class TelegramChatFileLoader(BaseLoader):
|
class TelegramChatFileLoader(BaseLoader):
|
||||||
"""Loader that loads Telegram chat json directory dump."""
|
"""Loads Telegram chat json directory dump."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
"""Initialize with path."""
|
"""Initialize with a path."""
|
||||||
self.file_path = path
|
self.file_path = path
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
@ -79,7 +79,7 @@ def text_to_docs(text: Union[str, List[str]]) -> List[Document]:
|
|||||||
|
|
||||||
|
|
||||||
class TelegramChatApiLoader(BaseLoader):
|
class TelegramChatApiLoader(BaseLoader):
|
||||||
"""Loader that loads Telegram chat json directory dump."""
|
"""Loads Telegram chat json directory dump."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -89,7 +89,16 @@ class TelegramChatApiLoader(BaseLoader):
|
|||||||
username: Optional[str] = None,
|
username: Optional[str] = None,
|
||||||
file_path: str = "telegram_data.json",
|
file_path: str = "telegram_data.json",
|
||||||
):
|
):
|
||||||
"""Initialize with API parameters."""
|
"""Initialize with API parameters.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
chat_entity: The chat entity to fetch data from.
|
||||||
|
api_id: The API ID.
|
||||||
|
api_hash: The API hash.
|
||||||
|
username: The username.
|
||||||
|
file_path: The file path to save the data to. Defaults to
|
||||||
|
"telegram_data.json".
|
||||||
|
"""
|
||||||
self.chat_entity = chat_entity
|
self.chat_entity = chat_entity
|
||||||
self.api_id = api_id
|
self.api_id = api_id
|
||||||
self.api_hash = api_hash
|
self.api_hash = api_hash
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads HTML to markdown using 2markdown."""
|
"""Loads HTML to markdown using 2markdown."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import Iterator, List
|
from typing import Iterator, List
|
||||||
@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class ToMarkdownLoader(BaseLoader):
|
class ToMarkdownLoader(BaseLoader):
|
||||||
"""Loader that loads HTML to markdown using 2markdown."""
|
"""Loads HTML to markdown using 2markdown."""
|
||||||
|
|
||||||
def __init__(self, url: str, api_key: str):
|
def __init__(self, url: str, api_key: str):
|
||||||
"""Initialize with url and api key."""
|
"""Initialize with url and api key."""
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads cards from Trello"""
|
"""Loads cards from Trello"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple
|
from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple
|
||||||
|
@ -12,7 +12,7 @@ def concatenate_rows(date: str, sender: str, text: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
class WhatsAppChatLoader(BaseLoader):
|
class WhatsAppChatLoader(BaseLoader):
|
||||||
"""Loader that loads WhatsApp messages text file."""
|
"""Loads WhatsApp messages text file."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
"""Initialize with path."""
|
"""Initialize with path."""
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads word documents."""
|
"""Loads word documents."""
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from abc import ABC
|
from abc import ABC
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads Microsoft Excel files."""
|
"""Loads Microsoft Excel files."""
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
"""Loader that loads YouTube transcript."""
|
"""Loads YouTube transcript."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
@ -140,7 +140,7 @@ def _parse_video_id(url: str) -> Optional[str]:
|
|||||||
|
|
||||||
|
|
||||||
class YoutubeLoader(BaseLoader):
|
class YoutubeLoader(BaseLoader):
|
||||||
"""Loader that loads Youtube transcripts."""
|
"""Loads Youtube transcripts."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -252,7 +252,7 @@ class YoutubeLoader(BaseLoader):
|
|||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class GoogleApiYoutubeLoader(BaseLoader):
|
class GoogleApiYoutubeLoader(BaseLoader):
|
||||||
"""Loader that loads all Videos from a Channel
|
"""Loads all Videos from a Channel
|
||||||
|
|
||||||
To use, you should have the ``googleapiclient,youtube_transcript_api``
|
To use, you should have the ``googleapiclient,youtube_transcript_api``
|
||||||
python package installed.
|
python package installed.
|
||||||
|
Loading…
Reference in New Issue
Block a user