mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-27 17:08:47 +00:00
docstrings: document_loaders consistency (#9139)
Formatted docstrings from different formats to consistent format, like: >Loads processed docs from Docugami. "Load from `Docugami`." >Loader that uses Unstructured to load HTML files. "Load `HTML` files using `Unstructured`." >Load documents from a directory. "Load from a directory." - `Load` - no `Loads` - DocumentLoader always loads Documents, so no more "documents/docs/texts/ etc" - integrated systems and APIs enclosed in backticks,
This commit is contained in:
parent
0aabded97f
commit
edb585228d
@ -1,4 +1,3 @@
|
|||||||
"""Loads acreom vault from a directory."""
|
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterator, List
|
from typing import Iterator, List
|
||||||
@ -8,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class AcreomLoader(BaseLoader):
|
class AcreomLoader(BaseLoader):
|
||||||
"""Loader that loads acreom vault from a directory."""
|
"""Load `acreom` vault from a directory."""
|
||||||
|
|
||||||
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
|
FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
|
||||||
"""Regex to match front matter metadata in markdown files."""
|
"""Regex to match front matter metadata in markdown files."""
|
||||||
@ -16,6 +15,7 @@ class AcreomLoader(BaseLoader):
|
|||||||
def __init__(
|
def __init__(
|
||||||
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
|
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
|
||||||
):
|
):
|
||||||
|
"""Initialize the loader."""
|
||||||
self.file_path = path
|
self.file_path = path
|
||||||
"""Path to the directory containing the markdown files."""
|
"""Path to the directory containing the markdown files."""
|
||||||
self.encoding = encoding
|
self.encoding = encoding
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads local airbyte json files."""
|
|
||||||
from typing import Any, Callable, Iterator, List, Mapping, Optional
|
from typing import Any, Callable, Iterator, List, Mapping, Optional
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -9,7 +8,7 @@ RecordHandler = Callable[[Any, Optional[str]], Document]
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteCDKLoader(BaseLoader):
|
class AirbyteCDKLoader(BaseLoader):
|
||||||
"""Loads records using an Airbyte source connector implemented using the CDK."""
|
"""Load with an `Airbyte` source connector implemented using the `CDK`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads local airbyte json files."""
|
|
||||||
import json
|
import json
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -8,7 +7,7 @@ from langchain.utils import stringify_dict
|
|||||||
|
|
||||||
|
|
||||||
class AirbyteJSONLoader(BaseLoader):
|
class AirbyteJSONLoader(BaseLoader):
|
||||||
"""Loads local airbyte json files."""
|
"""Load local `Airbyte` json files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
|
"""Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
|
||||||
|
@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class AirtableLoader(BaseLoader):
|
class AirtableLoader(BaseLoader):
|
||||||
"""Loader for Airtable tables."""
|
"""Load the `Airtable` tables."""
|
||||||
|
|
||||||
def __init__(self, api_token: str, table_id: str, base_id: str):
|
def __init__(self, api_token: str, table_id: str, base_id: str):
|
||||||
"""Initialize with API token and the IDs for table and base"""
|
"""Initialize with API token and the IDs for table and base"""
|
||||||
|
@ -7,7 +7,8 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class ApifyDatasetLoader(BaseLoader, BaseModel):
|
class ApifyDatasetLoader(BaseLoader, BaseModel):
|
||||||
"""Loads datasets from Apify-a web scraping, crawling, and data extraction platform.
|
"""Load datasets from `Apify` web scraping, crawling, and data extraction platform.
|
||||||
|
|
||||||
For details, see https://docs.apify.com/platform/integrations/langchain
|
For details, see https://docs.apify.com/platform/integrations/langchain
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
@ -6,7 +6,7 @@ from langchain.utilities.arxiv import ArxivAPIWrapper
|
|||||||
|
|
||||||
|
|
||||||
class ArxivLoader(BaseLoader):
|
class ArxivLoader(BaseLoader):
|
||||||
"""Loads a query result from arxiv.org into a list of Documents.
|
"""Load a query result from `Arxiv`.
|
||||||
|
|
||||||
The loader converts the original PDF format into the text.
|
The loader converts the original PDF format into the text.
|
||||||
"""
|
"""
|
||||||
|
@ -24,7 +24,7 @@ default_header_template = {
|
|||||||
|
|
||||||
|
|
||||||
class AsyncHtmlLoader(BaseLoader):
|
class AsyncHtmlLoader(BaseLoader):
|
||||||
"""Loads HTML asynchronously."""
|
"""Load `HTML` asynchronously."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@ -36,7 +36,7 @@ class AsyncHtmlLoader(BaseLoader):
|
|||||||
requests_kwargs: Dict[str, Any] = {},
|
requests_kwargs: Dict[str, Any] = {},
|
||||||
raise_for_status: bool = False,
|
raise_for_status: bool = False,
|
||||||
):
|
):
|
||||||
"""Initialize with webpage path."""
|
"""Initialize with a webpage path."""
|
||||||
|
|
||||||
# TODO: Deprecate web_path in favor of web_paths, and remove this
|
# TODO: Deprecate web_path in favor of web_paths, and remove this
|
||||||
# left like this because there are a number of loaders that expect single
|
# left like this because there are a number of loaders that expect single
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads AZLyrics."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class AZLyricsLoader(WebBaseLoader):
|
class AZLyricsLoader(WebBaseLoader):
|
||||||
"""Loads AZLyrics webpages."""
|
"""Load `AZLyrics` webpages."""
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load webpages into Documents."""
|
"""Load webpages into Documents."""
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loading logic for loading documents from an Azure Blob Storage container."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -9,7 +8,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class AzureBlobStorageContainerLoader(BaseLoader):
|
class AzureBlobStorageContainerLoader(BaseLoader):
|
||||||
"""Loading Documents from Azure Blob Storage."""
|
"""Load from `Azure Blob Storage` container."""
|
||||||
|
|
||||||
def __init__(self, conn_str: str, container: str, prefix: str = ""):
|
def __init__(self, conn_str: str, container: str, prefix: str = ""):
|
||||||
"""Initialize with connection string, container and blob prefix."""
|
"""Initialize with connection string, container and blob prefix."""
|
||||||
|
@ -8,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class AzureBlobStorageFileLoader(BaseLoader):
|
class AzureBlobStorageFileLoader(BaseLoader):
|
||||||
"""Loading Documents from Azure Blob Storage."""
|
"""Load from `Azure Blob Storage` files."""
|
||||||
|
|
||||||
def __init__(self, conn_str: str, container: str, blob_name: str):
|
def __init__(self, conn_str: str, container: str, blob_name: str):
|
||||||
"""Initialize with connection string, container and blob name."""
|
"""Initialize with connection string, container and blob name."""
|
||||||
|
@ -8,7 +8,7 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
|
|||||||
|
|
||||||
|
|
||||||
class BaseLoader(ABC):
|
class BaseLoader(ABC):
|
||||||
"""Interface for loading Documents.
|
"""Interface for Document Loader.
|
||||||
|
|
||||||
Implementations should implement the lazy-loading method using generators
|
Implementations should implement the lazy-loading method using generators
|
||||||
to avoid loading all Documents into memory at once.
|
to avoid loading all Documents into memory at once.
|
||||||
|
@ -11,7 +11,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class BibtexLoader(BaseLoader):
|
class BibtexLoader(BaseLoader):
|
||||||
"""Loads a bibtex file into a list of Documents.
|
"""Load a `bibtex` file.
|
||||||
|
|
||||||
Each document represents one entry from the bibtex file.
|
Each document represents one entry from the bibtex file.
|
||||||
|
|
||||||
|
@ -10,7 +10,7 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
class BigQueryLoader(BaseLoader):
|
class BigQueryLoader(BaseLoader):
|
||||||
"""Loads a query result from BigQuery into a list of documents.
|
"""Load from the Google Cloud Platform `BigQuery`.
|
||||||
|
|
||||||
Each document represents one row of the result. The `page_content_columns`
|
Each document represents one row of the result. The `page_content_columns`
|
||||||
are written into the `page_content` of the document. The `metadata_columns`
|
are written into the `page_content` of the document. The `metadata_columns`
|
||||||
|
@ -10,7 +10,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class BiliBiliLoader(BaseLoader):
|
class BiliBiliLoader(BaseLoader):
|
||||||
"""Loads bilibili transcripts."""
|
"""Load `BiliBili` video transcripts."""
|
||||||
|
|
||||||
def __init__(self, video_urls: List[str]):
|
def __init__(self, video_urls: List[str]):
|
||||||
"""Initialize with bilibili url.
|
"""Initialize with bilibili url.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads all documents from a blackboard course."""
|
|
||||||
import contextlib
|
import contextlib
|
||||||
import re
|
import re
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -12,7 +11,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class BlackboardLoader(WebBaseLoader):
|
class BlackboardLoader(WebBaseLoader):
|
||||||
"""Loads all documents from a Blackboard course.
|
"""Load a `Blackboard` course.
|
||||||
|
|
||||||
This loader is not compatible with all Blackboard courses. It is only
|
This loader is not compatible with all Blackboard courses. It is only
|
||||||
compatible with courses that use the new Blackboard interface.
|
compatible with courses that use the new Blackboard interface.
|
||||||
|
@ -20,7 +20,7 @@ class BlockchainType(Enum):
|
|||||||
|
|
||||||
|
|
||||||
class BlockchainDocumentLoader(BaseLoader):
|
class BlockchainDocumentLoader(BaseLoader):
|
||||||
"""Loads elements from a blockchain smart contract into Langchain documents.
|
"""Load elements from a blockchain smart contract.
|
||||||
|
|
||||||
The supported blockchains are: Ethereum mainnet, Ethereum Goerli testnet,
|
The supported blockchains are: Ethereum mainnet, Ethereum Goerli testnet,
|
||||||
Polygon mainnet, and Polygon Mumbai testnet.
|
Polygon mainnet, and Polygon Mumbai testnet.
|
||||||
|
@ -6,7 +6,7 @@ from langchain.utilities.brave_search import BraveSearchWrapper
|
|||||||
|
|
||||||
|
|
||||||
class BraveSearchLoader(BaseLoader):
|
class BraveSearchLoader(BaseLoader):
|
||||||
"""Loads a query result from Brave Search engine into a list of Documents."""
|
"""Load with `Brave Search` engine."""
|
||||||
|
|
||||||
def __init__(self, query: str, api_key: str, search_kwargs: Optional[dict] = None):
|
def __init__(self, query: str, api_key: str, search_kwargs: Optional[dict] = None):
|
||||||
"""Initializes the BraveLoader.
|
"""Initializes the BraveLoader.
|
||||||
|
@ -7,7 +7,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class BrowserlessLoader(BaseLoader):
|
class BrowserlessLoader(BaseLoader):
|
||||||
"""Loads the content of webpages using Browserless' /content endpoint"""
|
"""Load webpages with `Browserless` /content endpoint."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, api_token: str, urls: Union[str, List[str]], text_content: bool = True
|
self, api_token: str, urls: Union[str, List[str]], text_content: bool = True
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load conversations from ChatGPT data export"""
|
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
from typing import List
|
from typing import List
|
||||||
@ -29,7 +28,7 @@ def concatenate_rows(message: dict, title: str) -> str:
|
|||||||
|
|
||||||
|
|
||||||
class ChatGPTLoader(BaseLoader):
|
class ChatGPTLoader(BaseLoader):
|
||||||
"""Load conversations from exported ChatGPT data."""
|
"""Load conversations from exported `ChatGPT` data."""
|
||||||
|
|
||||||
def __init__(self, log_file: str, num_logs: int = -1):
|
def __init__(self, log_file: str, num_logs: int = -1):
|
||||||
"""Initialize a class object.
|
"""Initialize a class object.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads College Confidential."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class CollegeConfidentialLoader(WebBaseLoader):
|
class CollegeConfidentialLoader(WebBaseLoader):
|
||||||
"""Loads College Confidential webpages."""
|
"""Load `College Confidential` webpages."""
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Load webpages as Documents."""
|
"""Load webpages as Documents."""
|
||||||
|
@ -16,9 +16,7 @@ DEFAULT = Literal["default"]
|
|||||||
|
|
||||||
|
|
||||||
class ConcurrentLoader(GenericLoader):
|
class ConcurrentLoader(GenericLoader):
|
||||||
"""
|
"""Load and parse Documents concurrently."""
|
||||||
A generic document loader that loads and parses documents concurrently.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, blob_loader: BlobLoader, blob_parser: BaseBlobParser, num_workers: int = 4
|
self, blob_loader: BlobLoader, blob_parser: BaseBlobParser, num_workers: int = 4
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load Data from a Confluence Space"""
|
|
||||||
import logging
|
import logging
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -33,7 +32,7 @@ class ContentFormat(str, Enum):
|
|||||||
|
|
||||||
|
|
||||||
class ConfluenceLoader(BaseLoader):
|
class ConfluenceLoader(BaseLoader):
|
||||||
"""Load Confluence pages.
|
"""Load `Confluence` pages.
|
||||||
|
|
||||||
Port of https://llamahub.ai/l/confluence
|
Port of https://llamahub.ai/l/confluence
|
||||||
This currently supports username/api_key, Oauth2 login or personal access token
|
This currently supports username/api_key, Oauth2 login or personal access token
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load CoNLL-U files."""
|
|
||||||
import csv
|
import csv
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class CoNLLULoader(BaseLoader):
|
class CoNLLULoader(BaseLoader):
|
||||||
"""Load CoNLL-U files."""
|
"""Load `CoNLL-U` files."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with a file path."""
|
"""Initialize with a file path."""
|
||||||
|
@ -10,7 +10,7 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class CSVLoader(BaseLoader):
|
class CSVLoader(BaseLoader):
|
||||||
"""Loads a CSV file into a list of documents.
|
"""Load a `CSV` file into a list of Documents.
|
||||||
|
|
||||||
Each document represents one row of the CSV file. Every row is converted into a
|
Each document represents one row of the CSV file. Every row is converted into a
|
||||||
key/value pair and outputted to a new line in the document's page_content.
|
key/value pair and outputted to a new line in the document's page_content.
|
||||||
|
@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class CubeSemanticLoader(BaseLoader):
|
class CubeSemanticLoader(BaseLoader):
|
||||||
"""Load Cube semantic layer metadata.
|
"""Load `Cube semantic layer` metadata.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
cube_api_url: REST API endpoint.
|
cube_api_url: REST API endpoint.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load Datadog logs."""
|
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from typing import List, Optional
|
from typing import List, Optional
|
||||||
|
|
||||||
@ -7,7 +6,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class DatadogLogsLoader(BaseLoader):
|
class DatadogLogsLoader(BaseLoader):
|
||||||
"""Loads a query result from Datadog into a list of documents.
|
"""Load `Datadog` logs.
|
||||||
|
|
||||||
Logs are written into the `page_content` and into the `metadata`.
|
Logs are written into the `page_content` and into the `metadata`.
|
||||||
"""
|
"""
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load from a Dataframe object"""
|
|
||||||
from typing import Any, Iterator, List
|
from typing import Any, Iterator, List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class DataFrameLoader(BaseLoader):
|
class DataFrameLoader(BaseLoader):
|
||||||
"""Load Pandas DataFrame."""
|
"""Load `Pandas` DataFrame."""
|
||||||
|
|
||||||
def __init__(self, data_frame: Any, page_content_column: str = "text"):
|
def __init__(self, data_frame: Any, page_content_column: str = "text"):
|
||||||
"""Initialize with dataframe object.
|
"""Initialize with dataframe object.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loader that uses Diffbot to load webpages in text format."""
|
|
||||||
import logging
|
import logging
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
@ -11,7 +10,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class DiffbotLoader(BaseLoader):
|
class DiffbotLoader(BaseLoader):
|
||||||
"""Loads Diffbot file json."""
|
"""Load `Diffbot` json file."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, api_token: str, urls: List[str], continue_on_failure: bool = True
|
self, api_token: str, urls: List[str], continue_on_failure: bool = True
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load documents from a directory."""
|
|
||||||
import concurrent
|
import concurrent
|
||||||
import logging
|
import logging
|
||||||
import random
|
import random
|
||||||
@ -26,7 +25,7 @@ def _is_visible(p: Path) -> bool:
|
|||||||
|
|
||||||
|
|
||||||
class DirectoryLoader(BaseLoader):
|
class DirectoryLoader(BaseLoader):
|
||||||
"""Load documents from a directory."""
|
"""Load from a directory."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load from Discord chat dump"""
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, List
|
from typing import TYPE_CHECKING, List
|
||||||
@ -11,7 +10,7 @@ if TYPE_CHECKING:
|
|||||||
|
|
||||||
|
|
||||||
class DiscordChatLoader(BaseLoader):
|
class DiscordChatLoader(BaseLoader):
|
||||||
"""Load Discord chat logs."""
|
"""Load `Discord` chat logs."""
|
||||||
|
|
||||||
def __init__(self, chat_log: pd.DataFrame, user_id_col: str = "ID"):
|
def __init__(self, chat_log: pd.DataFrame, user_id_col: str = "ID"):
|
||||||
"""Initialize with a Pandas DataFrame containing chat logs.
|
"""Initialize with a Pandas DataFrame containing chat logs.
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
"""Loads processed documents from Docugami."""
|
|
||||||
|
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@ -29,7 +27,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class DocugamiLoader(BaseLoader, BaseModel):
|
class DocugamiLoader(BaseLoader, BaseModel):
|
||||||
"""Loads processed docs from Docugami.
|
"""Load from `Docugami`.
|
||||||
|
|
||||||
To use, you should have the ``lxml`` python package installed.
|
To use, you should have the ``lxml`` python package installed.
|
||||||
"""
|
"""
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
"""Loads data from Dropbox."""
|
|
||||||
|
|
||||||
# Prerequisites:
|
# Prerequisites:
|
||||||
# 1. Create a Dropbox app.
|
# 1. Create a Dropbox app.
|
||||||
# 2. Give the app these scope permissions: `files.metadata.read`
|
# 2. Give the app these scope permissions: `files.metadata.read`
|
||||||
@ -20,7 +18,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class DropboxLoader(BaseLoader, BaseModel):
|
class DropboxLoader(BaseLoader, BaseModel):
|
||||||
"""Loads files from Dropbox.
|
"""Load files from `Dropbox`.
|
||||||
|
|
||||||
In addition to common files such as text and PDF files, it also supports
|
In addition to common files such as text and PDF files, it also supports
|
||||||
*Dropbox Paper* files.
|
*Dropbox Paper* files.
|
||||||
|
@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class DuckDBLoader(BaseLoader):
|
class DuckDBLoader(BaseLoader):
|
||||||
"""Loads a query result from DuckDB into a list of documents.
|
"""Load from `DuckDB`.
|
||||||
|
|
||||||
Each document represents one row of the result. The `page_content_columns`
|
Each document represents one row of the result. The `page_content_columns`
|
||||||
are written into the `page_content` of the document. The `metadata_columns`
|
are written into the `page_content` of the document. The `metadata_columns`
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads email files."""
|
|
||||||
import os
|
import os
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
@ -11,7 +10,9 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredEmailLoader(UnstructuredFileLoader):
|
class UnstructuredEmailLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load email files. Works with both
|
"""Load email files with `unstructured`.
|
||||||
|
|
||||||
|
Works with both
|
||||||
.eml and .msg files. You can process attachments in addition to the
|
.eml and .msg files. You can process attachments in addition to the
|
||||||
e-mail message itself by passing process_attachments=True into the
|
e-mail message itself by passing process_attachments=True into the
|
||||||
constructor for the loader. By default, attachments will be processed
|
constructor for the loader. By default, attachments will be processed
|
||||||
|
@ -52,7 +52,7 @@ class EmbaasDocumentExtractionPayload(EmbaasDocumentExtractionParameters):
|
|||||||
|
|
||||||
|
|
||||||
class BaseEmbaasLoader(BaseModel):
|
class BaseEmbaasLoader(BaseModel):
|
||||||
"""Base class for embedding a model into an Embaas document extraction API."""
|
"""Base loader for embedding a model into an `Embaas` document extraction API."""
|
||||||
|
|
||||||
embaas_api_key: Optional[str] = None
|
embaas_api_key: Optional[str] = None
|
||||||
"""The API key for the embaas document extraction API."""
|
"""The API key for the embaas document extraction API."""
|
||||||
@ -72,7 +72,7 @@ class BaseEmbaasLoader(BaseModel):
|
|||||||
|
|
||||||
|
|
||||||
class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
|
class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
|
||||||
"""Embaas's document byte loader.
|
"""Load `Embaas` blob.
|
||||||
|
|
||||||
To use, you should have the
|
To use, you should have the
|
||||||
environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
|
environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
|
||||||
@ -178,7 +178,7 @@ class EmbaasBlobLoader(BaseEmbaasLoader, BaseBlobParser):
|
|||||||
|
|
||||||
|
|
||||||
class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
|
class EmbaasLoader(BaseEmbaasLoader, BaseLoader):
|
||||||
"""Embaas's document loader.
|
"""Load from `Embaas`.
|
||||||
|
|
||||||
To use, you should have the
|
To use, you should have the
|
||||||
environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
|
environment variable ``EMBAAS_API_KEY`` set with your API key, or pass
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads EPub files."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import (
|
from langchain.document_loaders.unstructured import (
|
||||||
@ -8,7 +7,7 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredEPubLoader(UnstructuredFileLoader):
|
class UnstructuredEPubLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses Unstructured to load EPUB files.
|
"""Load `EPub` files using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
|
@ -9,8 +9,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class EtherscanLoader(BaseLoader):
|
class EtherscanLoader(BaseLoader):
|
||||||
"""
|
"""Load transactions from `Ethereum` mainnet.
|
||||||
Load transactions from an account on Ethereum mainnet.
|
|
||||||
|
|
||||||
The Loader use Etherscan API to interact with Ethereum mainnet.
|
The Loader use Etherscan API to interact with Ethereum mainnet.
|
||||||
|
|
||||||
|
@ -15,7 +15,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class EverNoteLoader(BaseLoader):
|
class EverNoteLoader(BaseLoader):
|
||||||
"""EverNote Loader.
|
"""Load from `EverNote`.
|
||||||
|
|
||||||
Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
|
Loads an EverNote notebook export file e.g. my_notebook.enex into Documents.
|
||||||
Instructions on producing this file can be found at
|
Instructions on producing this file can be found at
|
||||||
|
@ -8,7 +8,9 @@ from langchain.document_loaders.unstructured import (
|
|||||||
|
|
||||||
|
|
||||||
class UnstructuredExcelLoader(UnstructuredFileLoader):
|
class UnstructuredExcelLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses unstructured to load Excel files. Like other
|
"""Load Microsoft Excel files using `Unstructured`.
|
||||||
|
|
||||||
|
Like other
|
||||||
Unstructured loaders, UnstructuredExcelLoader can be used in both
|
Unstructured loaders, UnstructuredExcelLoader can be used in both
|
||||||
"single" and "elements" mode. If you use the loader in "elements"
|
"single" and "elements" mode. If you use the loader in "elements"
|
||||||
mode, each sheet in the Excel file will be a an Unstructured Table
|
mode, each sheet in the Excel file will be a an Unstructured Table
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads Facebook chat json dump."""
|
|
||||||
import datetime
|
import datetime
|
||||||
import json
|
import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@ -23,7 +22,7 @@ def concatenate_rows(row: dict) -> str:
|
|||||||
|
|
||||||
|
|
||||||
class FacebookChatLoader(BaseLoader):
|
class FacebookChatLoader(BaseLoader):
|
||||||
"""Loads Facebook messages json directory dump."""
|
"""Load `Facebook Chat` messages directory dump."""
|
||||||
|
|
||||||
def __init__(self, path: str):
|
def __init__(self, path: str):
|
||||||
"""Initialize with a path."""
|
"""Initialize with a path."""
|
||||||
|
@ -5,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class FaunaLoader(BaseLoader):
|
class FaunaLoader(BaseLoader):
|
||||||
"""FaunaDB Loader.
|
"""Load from `FaunaDB`.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
query (str): The FQL query string to execute.
|
query (str): The FQL query string to execute.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads Figma files json dump."""
|
|
||||||
import json
|
import json
|
||||||
import urllib.request
|
import urllib.request
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
@ -9,7 +8,7 @@ from langchain.utils import stringify_dict
|
|||||||
|
|
||||||
|
|
||||||
class FigmaFileLoader(BaseLoader):
|
class FigmaFileLoader(BaseLoader):
|
||||||
"""Loads Figma file json."""
|
"""Load `Figma` file."""
|
||||||
|
|
||||||
def __init__(self, access_token: str, ids: str, key: str):
|
def __init__(self, access_token: str, ids: str, key: str):
|
||||||
"""Initialize with access token, ids, and key.
|
"""Initialize with access token, ids, and key.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loading logic for loading documents from an GCS directory."""
|
|
||||||
from typing import Callable, List, Optional
|
from typing import Callable, List, Optional
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -7,7 +6,7 @@ from langchain.document_loaders.gcs_file import GCSFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class GCSDirectoryLoader(BaseLoader):
|
class GCSDirectoryLoader(BaseLoader):
|
||||||
"""Loads Documents from GCS."""
|
"""Load from GCS directory."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load documents from a GCS file."""
|
|
||||||
import os
|
import os
|
||||||
import tempfile
|
import tempfile
|
||||||
from typing import Callable, List, Optional
|
from typing import Callable, List, Optional
|
||||||
@ -9,7 +8,7 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
|||||||
|
|
||||||
|
|
||||||
class GCSFileLoader(BaseLoader):
|
class GCSFileLoader(BaseLoader):
|
||||||
"""Load Documents from a GCS file."""
|
"""Load from GCS file."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
@ -15,7 +15,7 @@ DEFAULT = Literal["default"]
|
|||||||
|
|
||||||
|
|
||||||
class GenericLoader(BaseLoader):
|
class GenericLoader(BaseLoader):
|
||||||
"""A generic document loader.
|
"""Generic Document Loader.
|
||||||
|
|
||||||
A generic document loader that allows combining an arbitrary blob loader with
|
A generic document loader that allows combining an arbitrary blob loader with
|
||||||
a blob parser.
|
a blob parser.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Load from Dataframe object"""
|
|
||||||
from typing import Any, Iterator, List
|
from typing import Any, Iterator, List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class GeoDataFrameLoader(BaseLoader):
|
class GeoDataFrameLoader(BaseLoader):
|
||||||
"""Load geopandas Dataframe."""
|
"""Load `geopandas` Dataframe."""
|
||||||
|
|
||||||
def __init__(self, data_frame: Any, page_content_column: str = "geometry"):
|
def __init__(self, data_frame: Any, page_content_column: str = "geometry"):
|
||||||
"""Initialize with geopandas Dataframe.
|
"""Initialize with geopandas Dataframe.
|
||||||
|
@ -6,7 +6,8 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class GitLoader(BaseLoader):
|
class GitLoader(BaseLoader):
|
||||||
"""Loads files from a Git repository into a list of documents.
|
"""Load `Git` repository files.
|
||||||
|
|
||||||
The Repository can be local on disk available at `repo_path`,
|
The Repository can be local on disk available at `repo_path`,
|
||||||
or remote at `clone_url` that will be cloned to `repo_path`.
|
or remote at `clone_url` that will be cloned to `repo_path`.
|
||||||
Currently, supports only text files.
|
Currently, supports only text files.
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads GitBook."""
|
|
||||||
from typing import Any, List, Optional
|
from typing import Any, List, Optional
|
||||||
from urllib.parse import urljoin, urlparse
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
@ -7,7 +6,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class GitbookLoader(WebBaseLoader):
|
class GitbookLoader(WebBaseLoader):
|
||||||
"""Load GitBook data.
|
"""Load `GitBook` data.
|
||||||
|
|
||||||
1. load from either a single page, or
|
1. load from either a single page, or
|
||||||
2. load all (relative) paths in the navbar.
|
2. load all (relative) paths in the navbar.
|
||||||
|
@ -11,7 +11,7 @@ from langchain.utils import get_from_dict_or_env
|
|||||||
|
|
||||||
|
|
||||||
class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
|
class BaseGitHubLoader(BaseLoader, BaseModel, ABC):
|
||||||
"""Load issues of a GitHub repository."""
|
"""Load `GitHub` repository Issues."""
|
||||||
|
|
||||||
repo: str
|
repo: str
|
||||||
"""Name of repository"""
|
"""Name of repository"""
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
"""Loads data from Google Drive."""
|
|
||||||
|
|
||||||
# Prerequisites:
|
# Prerequisites:
|
||||||
# 1. Create a Google Cloud project
|
# 1. Create a Google Cloud project
|
||||||
# 2. Enable the Google Drive API:
|
# 2. Enable the Google Drive API:
|
||||||
@ -22,7 +20,7 @@ SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
|
|||||||
|
|
||||||
|
|
||||||
class GoogleDriveLoader(BaseLoader, BaseModel):
|
class GoogleDriveLoader(BaseLoader, BaseModel):
|
||||||
"""Loads Google Docs from Google Drive."""
|
"""Load Google Docs from `Google Drive`."""
|
||||||
|
|
||||||
service_account_key: Path = Path.home() / ".credentials" / "keys.json"
|
service_account_key: Path = Path.home() / ".credentials" / "keys.json"
|
||||||
"""Path to the service account key file."""
|
"""Path to the service account key file."""
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads .txt web files."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,7 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class GutenbergLoader(BaseLoader):
|
class GutenbergLoader(BaseLoader):
|
||||||
"""Loader that uses urllib to load .txt web files."""
|
"""Load from `Gutenberg.org`."""
|
||||||
|
|
||||||
def __init__(self, file_path: str):
|
def __init__(self, file_path: str):
|
||||||
"""Initialize with a file path."""
|
"""Initialize with a file path."""
|
||||||
|
@ -1,4 +1,3 @@
|
|||||||
"""Loads HN."""
|
|
||||||
from typing import Any, List
|
from typing import Any, List
|
||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
@ -6,7 +5,9 @@ from langchain.document_loaders.web_base import WebBaseLoader
|
|||||||
|
|
||||||
|
|
||||||
class HNLoader(WebBaseLoader):
|
class HNLoader(WebBaseLoader):
|
||||||
"""Load Hacker News data from either main page results or the comments page."""
|
"""Load `Hacker News` data.
|
||||||
|
|
||||||
|
It loads data from either main page results or the comments page."""
|
||||||
|
|
||||||
def load(self) -> List[Document]:
|
def load(self) -> List[Document]:
|
||||||
"""Get important HN webpage information.
|
"""Get important HN webpage information.
|
||||||
|
@ -1,11 +1,10 @@
|
|||||||
"""Loader that uses unstructured to load HTML files."""
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
from langchain.document_loaders.unstructured import UnstructuredFileLoader
|
||||||
|
|
||||||
|
|
||||||
class UnstructuredHTMLLoader(UnstructuredFileLoader):
|
class UnstructuredHTMLLoader(UnstructuredFileLoader):
|
||||||
"""Loader that uses Unstructured to load HTML files.
|
"""Load `HTML` files using `Unstructured`.
|
||||||
|
|
||||||
You can run the loader in one of two modes: "single" and "elements".
|
You can run the loader in one of two modes: "single" and "elements".
|
||||||
If you use "single" mode, the document will be returned as a single
|
If you use "single" mode, the document will be returned as a single
|
||||||
|
@ -1,5 +1,3 @@
|
|||||||
"""Loader that uses bs4 to load HTML files, enriching metadata with page title."""
|
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from typing import Dict, List, Union
|
from typing import Dict, List, Union
|
||||||
|
|
||||||
@ -10,7 +8,7 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class BSHTMLLoader(BaseLoader):
|
class BSHTMLLoader(BaseLoader):
|
||||||
"""Loader that uses beautiful soup to parse HTML files."""
|
"""Load `HTML` files and parse them with `beautiful soup`."""
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
|
Loading…
Reference in New Issue
Block a user