docstrings document_loaders 1 (#6847)

- Updated docstrings in `document_loaders`
- Several code fixes.
- Added `docs/extras/ecosystem/integrations/airtable.md`.

@rlancemartin, @eyurtsev
Authored by Leonid Ganeline on 2023-07-02 12:13:04 -07:00; committed via GitHub
parent e41b382e1c
commit 77ae8084a0
19 changed files with 127 additions and 51 deletions

View File

@@ -0,0 +1,28 @@
# Airtable

>[Airtable](https://en.wikipedia.org/wiki/Airtable) is a cloud collaboration service.
>`Airtable` is a spreadsheet-database hybrid, with the features of a database but applied to a spreadsheet.
>The fields in an Airtable table are similar to cells in a spreadsheet, but have types such as 'checkbox',
>'phone number', and 'drop-down list', and can reference file attachments like images.

>Users can create a database, set up column types, add records, link tables to one another, collaborate, sort records,
>and publish views to external websites.

## Installation and Setup

```bash
pip install pyairtable
```

* Get your [API key](https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens).
* Get the [ID of your base](https://airtable.com/developers/web/api/introduction).
* Get the [table ID from the table url](https://www.highviewapps.com/kb/where-can-i-find-the-airtable-base-id-and-table-id/#:~:text=Both%20the%20Airtable%20Base%20ID,URL%20that%20begins%20with%20tbl).

## Document Loader

```python
from langchain.document_loaders import AirtableLoader
```

See an [example](/docs/modules/data_connection/document_loaders/integrations/airtable.html).
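Tying the setup steps together, a minimal usage sketch; the token and IDs are placeholders you obtain in the steps above, and note the constructor takes the table ID before the base ID:

```python
from langchain.document_loaders import AirtableLoader

api_token = "xxx"               # personal API token (placeholder)
table_id = "tblXXXXXXXXXXXXXX"  # table IDs begin with "tbl"
base_id = "appXXXXXXXXXXXXXX"   # base IDs begin with "app"

loader = AirtableLoader(api_token, table_id, base_id)
docs = loader.load()  # one Document per record in the table
```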

View File

@@ -134,7 +134,7 @@
  "name": "python",
  "nbconvert_exporter": "python",
  "pygments_lexer": "ipython3",
- "version": "3.9.16"
+ "version": "3.10.6"
 }
},
"nbformat": 4,

View File

@@ -145,10 +145,10 @@ from langchain.document_loaders.youtube import (
    YoutubeLoader,
)

-# Legacy: only for backwards compat. Use PyPDFLoader instead
+# Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader

-# For backwards compatability
+# For backwards compatibility
TelegramChatLoader = TelegramChatFileLoader

__all__ = [

View File

@@ -8,15 +8,20 @@ from langchain.document_loaders.base import BaseLoader
class AcreomLoader(BaseLoader):
+    """Loader that loads acreom vault from a directory."""

    FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
+    """Regex to match front matter metadata in markdown files."""

    def __init__(
        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
    ):
+        """Initialize with path."""
        self.file_path = path
+        """Path to the directory containing the markdown files."""
        self.encoding = encoding
+        """Encoding to use when reading the files."""
        self.collect_metadata = collect_metadata
+        """Whether to collect metadata from the front matter."""

    def _parse_front_matter(self, content: str) -> dict:
        """Parse front matter metadata from the content and return it as a dict."""

View File

@@ -11,11 +11,11 @@ class AirbyteJSONLoader(BaseLoader):
    """Loader that loads local airbyte json files."""

    def __init__(self, file_path: str):
-        """Initialize with file path. This should start with '/tmp/airbyte_local/'."""
+        """Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
        self.file_path = file_path
+        """Path to the directory containing the json files."""

    def load(self) -> List[Document]:
+        """Load file."""
        text = ""
        for line in open(self.file_path, "r"):
            data = json.loads(line)["_airbyte_data"]
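For context, `load()` above reads the file line by line and pulls each record out of its `_airbyte_data` wrapper. A usage sketch with a placeholder file name:

```python
from langchain.document_loaders import AirbyteJSONLoader

# Placeholder path; local Airbyte JSON output lives under /tmp/airbyte_local/
loader = AirbyteJSONLoader("/tmp/airbyte_local/_airbyte_raw_users.jsonl")
docs = loader.load()  # the stringified _airbyte_data payloads
```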

View File

@@ -10,11 +10,14 @@ class AirtableLoader(BaseLoader):
    def __init__(self, api_token: str, table_id: str, base_id: str):
        """Initialize with API token and the IDs for table and base"""
        self.api_token = api_token
+        """Airtable API token."""
        self.table_id = table_id
+        """Airtable table ID."""
        self.base_id = base_id
+        """Airtable base ID."""

    def lazy_load(self) -> Iterator[Document]:
-        """Lazy load records from table."""
+        """Lazy load Documents from table."""
        from pyairtable import Table

@@ -32,5 +35,5 @@ class AirtableLoader(BaseLoader):
        )

    def load(self) -> List[Document]:
-        """Load Table."""
+        """Load Documents from table."""
        return list(self.lazy_load())
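Since `load()` is just `list(self.lazy_load())`, callers that don't need the whole table in memory can iterate instead; a small sketch with placeholder credentials:

```python
from langchain.document_loaders import AirtableLoader

loader = AirtableLoader("xxx", "tblXXXXXXXXXXXXXX", "appXXXXXXXXXXXXXX")

# Stream Documents one record at a time rather than materializing the table
for doc in loader.lazy_load():
    print(doc.page_content[:80])
```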

View File

@@ -1,4 +1,3 @@
-"""Logic for loading documents from Apify datasets."""
import from typing import Any, Callable, Dict, List

from pydantic import BaseModel, root_validator

@@ -8,9 +7,10 @@ from langchain.document_loaders.base import BaseLoader
class ApifyDatasetLoader(BaseLoader, BaseModel):
-    """Logic for loading documents from Apify datasets."""
+    """Loading Documents from Apify datasets."""

    apify_client: Any
+    """An instance of the ApifyClient class from the apify-client Python package."""
    dataset_id: str
    """The ID of the dataset on the Apify platform."""
    dataset_mapping_function: Callable[[Dict], Document]

@@ -34,7 +34,11 @@ class ApifyDatasetLoader(BaseLoader, BaseModel):
    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
-        """Validate environment."""
+        """Validate environment.
+
+        Args:
+            values: The values to validate.
+        """
        try:
            from apify_client import ApifyClient
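The `dataset_mapping_function` field documented above is what converts a raw dataset item (a dict) into a Document. A hedged sketch; the item keys `"text"` and `"url"` are assumptions that depend on the actor that produced the dataset:

```python
from langchain.document_loaders import ApifyDatasetLoader
from langchain.schema import Document

loader = ApifyDatasetLoader(
    dataset_id="your-dataset-id",  # placeholder
    dataset_mapping_function=lambda item: Document(
        page_content=item["text"],         # assumed item key
        metadata={"source": item["url"]},  # assumed item key
    ),
)
docs = loader.load()
```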

View File

@@ -19,8 +19,11 @@ class ArxivLoader(BaseLoader):
        load_all_available_meta: Optional[bool] = False,
    ):
        self.query = query
+        """The query to be passed to the arxiv.org API."""
        self.load_max_docs = load_max_docs
+        """The maximum number of documents to load."""
        self.load_all_available_meta = load_all_available_meta
+        """Whether to load all available metadata."""

    def load(self) -> List[Document]:
        arxiv_client = ArxivAPIWrapper(
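A usage sketch built from the three attributes documented above (the query value is illustrative):

```python
from langchain.document_loaders import ArxivLoader

loader = ArxivLoader(
    query="quantum computing",  # free-text query or an arXiv ID
    load_max_docs=2,            # cap on the number of papers fetched
    load_all_available_meta=False,
)
docs = loader.load()
```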

View File

@@ -9,7 +9,7 @@ class AZLyricsLoader(WebBaseLoader):
    """Loader that loads AZLyrics webpages."""

    def load(self) -> List[Document]:
-        """Load webpage."""
+        """Load webpages into Documents."""
        soup = self.scrape()
        title = soup.title.text
        lyrics = soup.find_all("div", {"class": ""})[2].text

View File

@@ -9,20 +9,23 @@ from langchain.document_loaders.base import BaseLoader
class AzureBlobStorageContainerLoader(BaseLoader):
-    """Loading logic for loading documents from Azure Blob Storage."""
+    """Loading Documents from Azure Blob Storage."""

    def __init__(self, conn_str: str, container: str, prefix: str = ""):
        """Initialize with connection string, container and blob prefix."""
        self.conn_str = conn_str
+        """Connection string for Azure Blob Storage."""
        self.container = container
+        """Container name."""
        self.prefix = prefix
+        """Prefix for blob names."""

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from azure.storage.blob import ContainerClient
        except ImportError as exc:
-            raise ValueError(
+            raise ImportError(
                "Could not import azure storage blob python package. "
                "Please install it with `pip install azure-storage-blob`."
            ) from exc
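A minimal sketch of the container loader above; the connection string, container, and prefix are placeholders:

```python
from langchain.document_loaders import AzureBlobStorageContainerLoader

loader = AzureBlobStorageContainerLoader(
    conn_str="DefaultEndpointsProtocol=https;AccountName=...;AccountKey=...",  # placeholder
    container="my-container",
    prefix="reports/",  # only blobs whose names start with this prefix are loaded
)
docs = loader.load()
```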

View File

@@ -1,4 +1,3 @@
-"""Loading logic for loading documents from an Azure Blob Storage file."""
import os
import tempfile
from typing import List

@@ -9,20 +8,23 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader
class AzureBlobStorageFileLoader(BaseLoader):
-    """Loading logic for loading documents from Azure Blob Storage."""
+    """Loading Documents from Azure Blob Storage."""

    def __init__(self, conn_str: str, container: str, blob_name: str):
        """Initialize with connection string, container and blob name."""
        self.conn_str = conn_str
+        """Connection string for Azure Blob Storage."""
        self.container = container
+        """Container name."""
        self.blob = blob_name
+        """Blob name."""

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from azure.storage.blob import BlobClient
        except ImportError as exc:
-            raise ValueError(
+            raise ImportError(
                "Could not import azure storage blob python package. "
                "Please install it with `pip install azure-storage-blob`."
            ) from exc

View File

@@ -8,10 +8,10 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
class BaseLoader(ABC):
-    """Interface for loading documents.
+    """Interface for loading Documents.

    Implementations should implement the lazy-loading method using generators
-    to avoid loading all documents into memory at once.
+    to avoid loading all Documents into memory at once.

    The `load` method will remain as is for backwards compatibility, but its
    implementation should be just `list(self.lazy_load())`.

@@ -22,12 +22,20 @@ class BaseLoader(ABC):
    # This method returns a List which is materialized in memory.
    @abstractmethod
    def load(self) -> List[Document]:
-        """Load data into document objects."""
+        """Load data into Document objects."""

    def load_and_split(
        self, text_splitter: Optional[TextSplitter] = None
    ) -> List[Document]:
-        """Load documents and split into chunks."""
+        """Load Documents and split into chunks. Chunks are returned as Documents.
+
+        Args:
+            text_splitter: TextSplitter instance to use for splitting documents.
+                Defaults to RecursiveCharacterTextSplitter.
+
+        Returns:
+            List of Documents.
+        """
        if text_splitter is None:
            _text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
        else:

@@ -40,7 +48,7 @@ class BaseLoader(ABC):
    def lazy_load(
        self,
    ) -> Iterator[Document]:
-        """A lazy loader for document content."""
+        """A lazy loader for Documents."""
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement lazy_load()"
        )

@@ -49,7 +57,7 @@ class BaseLoader(ABC):
class BaseBlobParser(ABC):
    """Abstract interface for blob parsers.

-    A blob parser is provides a way to parse raw data stored in a blob into one
+    A blob parser provides a way to parse raw data stored in a blob into one
    or more documents.

    The parser can be composed with blob loaders, making it easy to re-use
View File

@@ -34,8 +34,12 @@ class BibtexLoader(BaseLoader):
        Args:
            file_path: Path to the bibtex file.
+            parser: The parser to use. If None, a default parser is used.
            max_docs: Max number of associated documents to load. Use -1 means
                no limit.
+            max_content_chars: Maximum number of characters to load from the PDF.
+            load_extra_metadata: Whether to load extra metadata from the PDF.
+            file_pattern: Regex pattern to match the file name in the bibtex.
        """
        self.file_path = file_path
        self.parser = parser or BibtexparserWrapper()

@@ -70,9 +74,7 @@ class BibtexLoader(BaseLoader):
    def lazy_load(self) -> Iterator[Document]:
        """Load bibtex file using bibtexparser and get the article texts plus the
        article metadata.

        See https://bibtexparser.readthedocs.io/en/master/

        Returns:
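A hedged usage sketch of the parameters documented above; the .bib path is a placeholder and only the keyword names shown in the Args list are assumed:

```python
from langchain.document_loaders import BibtexLoader

loader = BibtexLoader("references.bib", max_docs=2)  # placeholder path
for doc in loader.lazy_load():
    print(doc.metadata)
```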

View File

@@ -37,7 +37,7 @@ class BigQueryLoader(BaseLoader):
        metadata_columns: Optional. The columns to write into the `metadata` of the
            document.
        credentials : google.auth.credentials.Credentials, optional
            Credentials for accessing Google APIs. Use this parameter to override
            default credentials, such as to use Compute Engine
            (`google.auth.compute_engine.Credentials`) or Service Account
            (`google.oauth2.service_account.Credentials`) credentials directly.

@@ -52,7 +52,7 @@ class BigQueryLoader(BaseLoader):
        try:
            from google.cloud import bigquery
        except ImportError as ex:
-            raise ValueError(
+            raise ImportError(
                "Could not import google-cloud-bigquery python package. "
                "Please install it with `pip install google-cloud-bigquery`."
            ) from ex
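A sketch using the `query` and `metadata_columns` parameters documented above; the project, dataset, and column names are invented:

```python
from langchain.document_loaders import BigQueryLoader

QUERY = """
SELECT title, body, url
FROM `my-project.my_dataset.articles`
LIMIT 10
"""

# Each row becomes a Document; `url` is routed into metadata
loader = BigQueryLoader(QUERY, metadata_columns=["url"])
docs = loader.load()
```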

View File

@@ -13,11 +13,15 @@ class BiliBiliLoader(BaseLoader):
    """Loader that loads bilibili transcripts."""

    def __init__(self, video_urls: List[str]):
-        """Initialize with bilibili url."""
+        """Initialize with bilibili url.
+
+        Args:
+            video_urls: List of bilibili urls.
+        """
        self.video_urls = video_urls

    def load(self) -> List[Document]:
-        """Load from bilibili url."""
+        """Load Documents from bilibili url."""
        results = []
        for url in self.video_urls:
            transcript, video_info = self._get_bilibili_subs_and_info(url)

@@ -30,7 +34,7 @@ class BiliBiliLoader(BaseLoader):
        try:
            from bilibili_api import sync, video
        except ImportError:
-            raise ValueError(
+            raise ImportError(
                "requests package not found, please install it with "
                "`pip install bilibili-api-python`"
            )
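A usage sketch of the constructor documented above (the URL is a placeholder):

```python
from langchain.document_loaders import BiliBiliLoader

loader = BiliBiliLoader(
    video_urls=["https://www.bilibili.com/video/BVxxxxxxxxxx"]  # placeholder
)
docs = loader.load()  # one Document per video: transcript plus video metadata
```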

View File

@@ -12,7 +12,7 @@ from langchain.document_loaders.web_base import WebBaseLoader
class BlackboardLoader(WebBaseLoader):
-    """Loader that loads all documents from a Blackboard course.
+    """Loads all documents from a Blackboard course.

    This loader is not compatible with all Blackboard courses. It is only
    compatible with courses that use the new Blackboard interface.

@@ -34,8 +34,11 @@ class BlackboardLoader(WebBaseLoader):
    """

    base_url: str
+    """Base url of the blackboard course."""
    folder_path: str
+    """Path to the folder containing the documents."""
    load_all_recursively: bool
+    """If True, load all documents recursively."""

    def __init__(
        self,

@@ -64,7 +67,7 @@ class BlackboardLoader(WebBaseLoader):
        try:
            self.base_url = blackboard_course_url.split("/webapps/blackboard")[0]
        except IndexError:
-            raise ValueError(
+            raise IndexError(
                "Invalid blackboard course url. "
                "Please provide a url that starts with "
                "https://<blackboard_url>/webapps/blackboard"

@@ -94,10 +97,10 @@ class BlackboardLoader(WebBaseLoader):
        )

    def load(self) -> List[Document]:
-        """Load data into document objects.
+        """Load data into Document objects.

        Returns:
-            List of documents.
+            List of Documents.
        """
        if self.load_all_recursively:
            soup_info = self.scrape()

@@ -118,7 +121,7 @@ class BlackboardLoader(WebBaseLoader):
        return self._get_documents(soup_info)

    def _get_folder_path(self, soup: Any) -> str:
-        """Get the folder path to save the documents in.
+        """Get the folder path to save the Documents in.

        Args:
            soup: BeautifulSoup4 soup object.

@@ -229,7 +232,7 @@ class BlackboardLoader(WebBaseLoader):
        return relative_paths

    def download(self, path: str) -> None:
-        """Download a file from a url.
+        """Download a file from an url.

        Args:
            path: Path to the file.

@@ -243,7 +246,7 @@ class BlackboardLoader(WebBaseLoader):
            f.write(response.content)

    def parse_filename(self, url: str) -> str:
-        """Parse the filename from a url.
+        """Parse the filename from an url.

        Args:
            url: Url to parse the filename from.

@@ -257,7 +260,7 @@ class BlackboardLoader(WebBaseLoader):
        return self._parse_filename_from_url(url)

    def _parse_filename_from_url(self, url: str) -> str:
-        """Parse the filename from a url.
+        """Parse the filename from an url.

        Args:
            url: Url to parse the filename from.
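A hedged usage sketch: `blackboard_course_url` is visible in the diff above, but the `bbrouter` session-cookie argument is an assumption recalled from the loader's own documentation and may vary by version:

```python
from langchain.document_loaders import BlackboardLoader

loader = BlackboardLoader(
    blackboard_course_url="https://blackboard.example.com/webapps/blackboard/content/listContent.jsp?course_id=_123456_1",
    bbrouter="expires:12345...",  # assumed session-cookie argument
    load_all_recursively=True,
)
docs = loader.load()
```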

View File

@@ -55,6 +55,16 @@ class BlockchainDocumentLoader(BaseLoader):
        get_all_tokens: bool = False,
        max_execution_time: Optional[int] = None,
    ):
+        """
+
+        Args:
+            contract_address: The address of the smart contract.
+            blockchainType: The blockchain type.
+            api_key: The Alchemy API key.
+            startToken: The start token for pagination.
+            get_all_tokens: Whether to get all tokens on the contract.
+            max_execution_time: The maximum execution time (sec).
+        """
        self.contract_address = contract_address
        self.blockchainType = blockchainType.value
        self.api_key = os.environ.get("ALCHEMY_API_KEY") or api_key
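A sketch assembled from the Args documented above; the contract address is a placeholder, and `BlockchainType.ETH_MAINNET` is assumed to be the enum value the loader expects:

```python
import os

from langchain.document_loaders.blockchain import (
    BlockchainDocumentLoader,
    BlockchainType,
)

loader = BlockchainDocumentLoader(
    contract_address="0x0000000000000000000000000000000000000000",  # placeholder
    blockchainType=BlockchainType.ETH_MAINNET,  # assumed enum value
    api_key=os.environ["ALCHEMY_API_KEY"],
    get_all_tokens=False,
)
docs = loader.load()  # one Document per NFT token fetched from Alchemy
```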

View File

@@ -1,4 +1,3 @@
-"""Load conversations from ChatGPT data export"""
import datetime
import json
from typing import List

@@ -29,9 +28,15 @@ def concatenate_rows(message: dict, title: str) -> str:
class ChatGPTLoader(BaseLoader):
-    """Loader that loads conversations from exported ChatGPT data."""
+    """Load conversations from exported ChatGPT data."""

    def __init__(self, log_file: str, num_logs: int = -1):
+        """
+        Args:
+            log_file: Path to the log file
+            num_logs: Number of logs to load. If 0, load all logs.
+        """
        self.log_file = log_file
        self.num_logs = num_logs
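A usage sketch of the constructor documented above; the file name is a placeholder for the `conversations.json` produced by ChatGPT's account data export:

```python
from langchain.document_loaders import ChatGPTLoader

loader = ChatGPTLoader(log_file="conversations.json", num_logs=5)
docs = loader.load()  # one Document per conversation
```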

View File

@@ -284,9 +284,7 @@
 "    error=False,  # Only runs that succeed\n",
 ")\n",
 "for run in runs:\n",
-"    client.create_example(\n",
-"        inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n",
-"    )"
+"    client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)"
 ]
},
{

@@ -333,7 +331,7 @@
 "eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
 "\n",
 "# Measures accuracy against ground truth\n",
-"qa_evaluator = get_qa_evaluator(eval_llm) \n",
+"qa_evaluator = get_qa_evaluator(eval_llm)\n",
 "\n",
 "# Measures how effective and efficient the agent's actions are\n",
 "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",

@@ -392,13 +390,13 @@
 "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
 "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
 "\n",
+"\n",
 "# Since chains can be stateful (e.g. they can have memory), we need provide\n",
 "# a way to initialize a new chain for each row in the dataset. This is done\n",
 "# by passing in a factory function that returns a new chain for each row.\n",
 "def agent_factory():\n",
-"    return initialize_agent(\n",
-"        tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
-")\n",
+"    return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n",
+"\n",
 "\n",
 "# If your chain is NOT stateful, your factory can return the object directly\n",
 "# to improve runtime performance. For example:\n",

@@ -477,7 +475,7 @@
 "source": [
 "from langchain.client import (\n",
 "    arun_on_dataset,\n",
-"    run_on_dataset, # Available if your chain doesn't support async calls.\n",
+"    run_on_dataset,  # Available if your chain doesn't support async calls.\n",
 ")\n",
 "\n",
 "?arun_on_dataset"

@@ -616,9 +614,7 @@
},
"outputs": [],
"source": [
-"agent = initialize_agent(\n",
-"    tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
-")"
+"agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)"
 ]
},
{