docstrings document_loaders 1 (#6847)

- Updated docstrings in `document_loaders`; several code fixes.
- Added `docs/extras/ecosystem/integrations/airtable.md`.

@rlancemartin, @eyurtsev
This commit is contained in:
parent e41b382e1c
commit 77ae8084a0
28  docs/extras/ecosystem/integrations/airtable.md  Normal file
@@ -0,0 +1,28 @@
# Airtable

>[Airtable](https://en.wikipedia.org/wiki/Airtable) is a cloud collaboration service.
`Airtable` is a spreadsheet-database hybrid, with the features of a database but applied to a spreadsheet.
> The fields in an Airtable table are similar to cells in a spreadsheet, but have types such as 'checkbox',
> 'phone number', and 'drop-down list', and can reference file attachments like images.

>Users can create a database, set up column types, add records, link tables to one another, collaborate, sort records
> and publish views to external websites.

## Installation and Setup

```bash
pip install pyairtable
```

* Get your [API key](https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens).
* Get the [ID of your base](https://airtable.com/developers/web/api/introduction).
* Get the [table ID from the table url](https://www.highviewapps.com/kb/where-can-i-find-the-airtable-base-id-and-table-id/#:~:text=Both%20the%20Airtable%20Base%20ID,URL%20that%20begins%20with%20tbl).

## Document Loader

```python
from langchain.document_loaders import AirtableLoader
```

See an [example](/docs/modules/data_connection/document_loaders/integrations/airtable.html).
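A minimal usage sketch, assuming the constructor shown in the `airtable.py` hunks later in this diff (`api_token`, `table_id`, `base_id`); the credential values are placeholders:

```python
from langchain.document_loaders import AirtableLoader

# Placeholder credentials; substitute your own token and IDs.
api_token = "patXXXXXXXXXXXXXX"
table_id = "tblXXXXXXXXXXXXXX"
base_id = "appXXXXXXXXXXXXXX"

loader = AirtableLoader(api_token, table_id, base_id)
docs = loader.load()  # one Document per Airtable record
```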
@@ -134,7 +134,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-  "version": "3.9.16"
+  "version": "3.10.6"
  }
 },
 "nbformat": 4,
@@ -145,10 +145,10 @@ from langchain.document_loaders.youtube import (
    YoutubeLoader,
)

-# Legacy: only for backwards compat. Use PyPDFLoader instead
+# Legacy: only for backwards compatibility. Use PyPDFLoader instead
PagedPDFSplitter = PyPDFLoader

-# For backwards compatability
+# For backwards compatibility
TelegramChatLoader = TelegramChatFileLoader

__all__ = [
@@ -8,15 +8,20 @@ from langchain.document_loaders.base import BaseLoader


class AcreomLoader(BaseLoader):
    """Loader that loads acreom vault from a directory."""

    FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
+    """Regex to match front matter metadata in markdown files."""

    def __init__(
        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
    ):
        """Initialize with path."""
        self.file_path = path
+        """Path to the directory containing the markdown files."""
        self.encoding = encoding
+        """Encoding to use when reading the files."""
        self.collect_metadata = collect_metadata
+        """Whether to collect metadata from the front matter."""

    def _parse_front_matter(self, content: str) -> dict:
+        """Parse front matter metadata from the content and return it as a dict."""
@@ -11,11 +11,11 @@ class AirbyteJSONLoader(BaseLoader):
    """Loader that loads local airbyte json files."""

    def __init__(self, file_path: str):
-        """Initialize with file path. This should start with '/tmp/airbyte_local/'."""
+        """Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
        self.file_path = file_path
        """Path to the directory containing the json files."""

    def load(self) -> List[Document]:
        """Load file."""
        text = ""
        for line in open(self.file_path, "r"):
            data = json.loads(line)["_airbyte_data"]
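The `load` body is truncated in this hunk; the overall pattern is to pull `_airbyte_data` out of each JSON line and flatten it into text. A hedged sketch of that shape (the `stringify` helper and the file path are illustrative, not the library's own):

```python
import json


def stringify(data: dict, indent: int = 0) -> str:
    # Illustrative recursive flattener for nested record dicts.
    lines = []
    for key, value in data.items():
        if isinstance(value, dict):
            lines.append(" " * indent + f"{key}:")
            lines.append(stringify(value, indent + 2))
        else:
            lines.append(" " * indent + f"{key}: {value}")
    return "\n".join(lines)


text = ""
with open("/tmp/airbyte_local/_airbyte_raw_example.jsonl") as f:  # placeholder path
    for line in f:
        data = json.loads(line)["_airbyte_data"]
        text += stringify(data) + "\n"
```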
@@ -10,11 +10,14 @@ class AirtableLoader(BaseLoader):
    def __init__(self, api_token: str, table_id: str, base_id: str):
        """Initialize with API token and the IDs for table and base"""
        self.api_token = api_token
+        """Airtable API token."""
        self.table_id = table_id
+        """Airtable table ID."""
        self.base_id = base_id
+        """Airtable base ID."""

    def lazy_load(self) -> Iterator[Document]:
-        """Lazy load records from table."""
+        """Lazy load Documents from table."""

        from pyairtable import Table

@@ -32,5 +35,5 @@ class AirtableLoader(BaseLoader):
        )

    def load(self) -> List[Document]:
-        """Load Table."""
+        """Load Documents from table."""
        return list(self.lazy_load())
@@ -1,4 +1,3 @@
-"""Logic for loading documents from Apify datasets."""
from typing import Any, Callable, Dict, List

from pydantic import BaseModel, root_validator

@@ -8,9 +7,10 @@ from langchain.document_loaders.base import BaseLoader


class ApifyDatasetLoader(BaseLoader, BaseModel):
-    """Logic for loading documents from Apify datasets."""
+    """Loading Documents from Apify datasets."""

    apify_client: Any
    """An instance of the ApifyClient class from the apify-client Python package."""
    dataset_id: str
    """The ID of the dataset on the Apify platform."""
    dataset_mapping_function: Callable[[Dict], Document]

@@ -34,7 +34,11 @@ class ApifyDatasetLoader(BaseLoader, BaseModel):

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
-        """Validate environment."""
+        """Validate environment.
+
+        Args:
+            values: The values to validate.
+        """

        try:
            from apify_client import ApifyClient
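Since `dataset_mapping_function` is the one non-obvious field here, a hedged usage sketch; the dataset ID and the `"text"`/`"url"` item fields are assumptions about the scraped data:

```python
from langchain.document_loaders import ApifyDatasetLoader
from langchain.schema import Document

loader = ApifyDatasetLoader(
    dataset_id="your-dataset-id",  # placeholder
    # Maps one raw dataset item (a dict) to a Document.
    dataset_mapping_function=lambda item: Document(
        page_content=item["text"], metadata={"source": item["url"]}
    ),
)
docs = loader.load()
```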
@@ -19,8 +19,11 @@ class ArxivLoader(BaseLoader):
        load_all_available_meta: Optional[bool] = False,
    ):
        self.query = query
+        """The query to be passed to the arxiv.org API."""
        self.load_max_docs = load_max_docs
+        """The maximum number of documents to load."""
        self.load_all_available_meta = load_all_available_meta
+        """Whether to load all available metadata."""

    def load(self) -> List[Document]:
        arxiv_client = ArxivAPIWrapper(
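A quick usage sketch for the three attributes documented above; the query string is a placeholder:

```python
from langchain.document_loaders import ArxivLoader

# Fetch up to two matching papers; keep only the default metadata fields.
loader = ArxivLoader(query="quantum computing", load_max_docs=2)
docs = loader.load()
print(docs[0].metadata)
```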
@@ -9,7 +9,7 @@ class AZLyricsLoader(WebBaseLoader):
    """Loader that loads AZLyrics webpages."""

    def load(self) -> List[Document]:
-        """Load webpage."""
+        """Load webpages into Documents."""
        soup = self.scrape()
        title = soup.title.text
        lyrics = soup.find_all("div", {"class": ""})[2].text
@@ -9,20 +9,23 @@ from langchain.document_loaders.base import BaseLoader


class AzureBlobStorageContainerLoader(BaseLoader):
-    """Loading logic for loading documents from Azure Blob Storage."""
+    """Loading Documents from Azure Blob Storage."""

    def __init__(self, conn_str: str, container: str, prefix: str = ""):
        """Initialize with connection string, container and blob prefix."""
        self.conn_str = conn_str
+        """Connection string for Azure Blob Storage."""
        self.container = container
+        """Container name."""
        self.prefix = prefix
+        """Prefix for blob names."""

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from azure.storage.blob import ContainerClient
        except ImportError as exc:
-            raise ValueError(
+            raise ImportError(
                "Could not import azure storage blob python package. "
                "Please install it with `pip install azure-storage-blob`."
            ) from exc
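A hedged usage sketch for the container loader; the connection string, container, and prefix are placeholders:

```python
from langchain.document_loaders import AzureBlobStorageContainerLoader

loader = AzureBlobStorageContainerLoader(
    conn_str="<connection-string>",  # placeholder
    container="mycontainer",
    prefix="reports/",  # only blobs whose names start with this prefix
)
docs = loader.load()
```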
@@ -1,4 +1,3 @@
-"""Loading logic for loading documents from an Azure Blob Storage file."""
import os
import tempfile
from typing import List

@@ -9,20 +8,23 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader


class AzureBlobStorageFileLoader(BaseLoader):
-    """Loading logic for loading documents from Azure Blob Storage."""
+    """Loading Documents from Azure Blob Storage."""

    def __init__(self, conn_str: str, container: str, blob_name: str):
        """Initialize with connection string, container and blob name."""
        self.conn_str = conn_str
+        """Connection string for Azure Blob Storage."""
        self.container = container
+        """Container name."""
        self.blob = blob_name
+        """Blob name."""

    def load(self) -> List[Document]:
        """Load documents."""
        try:
            from azure.storage.blob import BlobClient
        except ImportError as exc:
-            raise ValueError(
+            raise ImportError(
                "Could not import azure storage blob python package. "
                "Please install it with `pip install azure-storage-blob`."
            ) from exc
@@ -8,10 +8,10 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter


class BaseLoader(ABC):
-    """Interface for loading documents.
+    """Interface for loading Documents.

    Implementations should implement the lazy-loading method using generators
-    to avoid loading all documents into memory at once.
+    to avoid loading all Documents into memory at once.

    The `load` method will remain as is for backwards compatibility, but its
    implementation should be just `list(self.lazy_load())`.

@@ -22,12 +22,20 @@ class BaseLoader(ABC):
    # This method returns a List which is materialized in memory.
    @abstractmethod
    def load(self) -> List[Document]:
-        """Load data into document objects."""
+        """Load data into Document objects."""

    def load_and_split(
        self, text_splitter: Optional[TextSplitter] = None
    ) -> List[Document]:
-        """Load documents and split into chunks."""
+        """Load Documents and split into chunks. Chunks are returned as Documents.
+
+        Args:
+            text_splitter: TextSplitter instance to use for splitting documents.
+                Defaults to RecursiveCharacterTextSplitter.
+
+        Returns:
+            List of Documents.
+        """
        if text_splitter is None:
            _text_splitter: TextSplitter = RecursiveCharacterTextSplitter()
        else:

@@ -40,7 +48,7 @@ class BaseLoader(ABC):
    def lazy_load(
        self,
    ) -> Iterator[Document]:
-        """A lazy loader for document content."""
+        """A lazy loader for Documents."""
        raise NotImplementedError(
            f"{self.__class__.__name__} does not implement lazy_load()"
        )
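To make the contract above concrete, a minimal custom loader sketch (the class and file format are invented for illustration): implement `lazy_load` as a generator and let `load` simply materialize it.

```python
from typing import Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class LineLoader(BaseLoader):
    """Hypothetical loader: one Document per line of a text file."""

    def __init__(self, file_path: str):
        self.file_path = file_path

    def lazy_load(self) -> Iterator[Document]:
        # Generator: Documents are produced one at a time, never all in memory.
        with open(self.file_path) as f:
            for i, line in enumerate(f):
                yield Document(page_content=line.rstrip("\n"), metadata={"line": i})

    def load(self) -> List[Document]:
        # Per the interface note above: just materialize the lazy iterator.
        return list(self.lazy_load())
```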
@@ -49,7 +57,7 @@ class BaseLoader(ABC):
class BaseBlobParser(ABC):
    """Abstract interface for blob parsers.

-    A blob parser is provides a way to parse raw data stored in a blob into one
+    A blob parser provides a way to parse raw data stored in a blob into one
    or more documents.

    The parser can be composed with blob loaders, making it easy to re-use
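For illustration, a tiny parser satisfying this interface; it assumes the `Blob` class from `langchain.document_loaders.blob_loaders` and decodes the whole blob into a single Document:

```python
from typing import Iterator

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob


class PlainTextParser(BaseBlobParser):
    """Hypothetical parser: one Document holding the blob's decoded text."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        yield Document(
            page_content=blob.as_string(),
            metadata={"source": blob.source},
        )
```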
@@ -34,8 +34,12 @@ class BibtexLoader(BaseLoader):

        Args:
            file_path: Path to the bibtex file.
            parser: The parser to use. If None, a default parser is used.
            max_docs: Max number of associated documents to load. Use -1 means
                no limit.
            max_content_chars: Maximum number of characters to load from the PDF.
            load_extra_metadata: Whether to load extra metadata from the PDF.
            file_pattern: Regex pattern to match the file name in the bibtex.
        """
        self.file_path = file_path
        self.parser = parser or BibtexparserWrapper()

@@ -70,9 +74,7 @@ class BibtexLoader(BaseLoader):

    def lazy_load(self) -> Iterator[Document]:
        """Load bibtex file using bibtexparser and get the article texts plus the
        article metadata.

        See https://bibtexparser.readthedocs.io/en/master/

        Returns:
@@ -37,7 +37,7 @@ class BigQueryLoader(BaseLoader):
        metadata_columns: Optional. The columns to write into the `metadata` of the
            document.
        credentials : google.auth.credentials.Credentials, optional
-            Credentials for accessing Google APIs. Use this parameter to override
+            Credentials for accessing Google APIs. Use this parameter to override
            default credentials, such as to use Compute Engine
            (`google.auth.compute_engine.Credentials`) or Service Account
            (`google.oauth2.service_account.Credentials`) credentials directly.

@@ -52,7 +52,7 @@ class BigQueryLoader(BaseLoader):
        try:
            from google.cloud import bigquery
        except ImportError as ex:
-            raise ValueError(
+            raise ImportError(
                "Could not import google-cloud-bigquery python package. "
                "Please install it with `pip install google-cloud-bigquery`."
            ) from ex
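A hedged usage sketch; the query and column names are placeholders, and credentials fall back to the ambient defaults described above:

```python
from langchain.document_loaders import BigQueryLoader

loader = BigQueryLoader(
    query="SELECT title, body FROM `my_project.my_dataset.articles`",  # placeholder
    metadata_columns=["title"],  # goes into Document.metadata; the rest is content
)
docs = loader.load()
```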
@@ -13,11 +13,15 @@ class BiliBiliLoader(BaseLoader):
    """Loader that loads bilibili transcripts."""

    def __init__(self, video_urls: List[str]):
-        """Initialize with bilibili url."""
+        """Initialize with bilibili url.
+
+        Args:
+            video_urls: List of bilibili urls.
+        """
        self.video_urls = video_urls

    def load(self) -> List[Document]:
-        """Load from bilibili url."""
+        """Load Documents from bilibili url."""
        results = []
        for url in self.video_urls:
            transcript, video_info = self._get_bilibili_subs_and_info(url)

@@ -30,7 +34,7 @@ class BiliBiliLoader(BaseLoader):
        try:
            from bilibili_api import sync, video
        except ImportError:
-            raise ValueError(
+            raise ImportError(
                "requests package not found, please install it with "
                "`pip install bilibili-api-python`"
            )
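A usage sketch for the list-of-URLs constructor; the video URL is a placeholder:

```python
from langchain.document_loaders import BiliBiliLoader

loader = BiliBiliLoader(["https://www.bilibili.com/video/BV1xt411o7Xu"])  # placeholder
docs = loader.load()  # one Document per video
```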
@@ -12,7 +12,7 @@ from langchain.document_loaders.web_base import WebBaseLoader


class BlackboardLoader(WebBaseLoader):
-    """Loader that loads all documents from a Blackboard course.
+    """Loads all documents from a Blackboard course.

    This loader is not compatible with all Blackboard courses. It is only
    compatible with courses that use the new Blackboard interface.

@@ -34,8 +34,11 @@ class BlackboardLoader(WebBaseLoader):
    """

    base_url: str
+    """Base url of the blackboard course."""
    folder_path: str
+    """Path to the folder containing the documents."""
    load_all_recursively: bool
+    """If True, load all documents recursively."""

    def __init__(
        self,

@@ -64,7 +67,7 @@ class BlackboardLoader(WebBaseLoader):
        try:
            self.base_url = blackboard_course_url.split("/webapps/blackboard")[0]
        except IndexError:
-            raise ValueError(
+            raise IndexError(
                "Invalid blackboard course url. "
                "Please provide a url that starts with "
                "https://<blackboard_url>/webapps/blackboard"
@@ -94,10 +97,10 @@ class BlackboardLoader(WebBaseLoader):
        )

    def load(self) -> List[Document]:
-        """Load data into document objects.
+        """Load data into Document objects.

        Returns:
-            List of documents.
+            List of Documents.
        """
        if self.load_all_recursively:
            soup_info = self.scrape()

@@ -118,7 +121,7 @@ class BlackboardLoader(WebBaseLoader):
        return self._get_documents(soup_info)

    def _get_folder_path(self, soup: Any) -> str:
-        """Get the folder path to save the documents in.
+        """Get the folder path to save the Documents in.

        Args:
            soup: BeautifulSoup4 soup object.

@@ -229,7 +232,7 @@ class BlackboardLoader(WebBaseLoader):
        return relative_paths

    def download(self, path: str) -> None:
-        """Download a file from a url.
+        """Download a file from an url.

        Args:
            path: Path to the file.

@@ -243,7 +246,7 @@ class BlackboardLoader(WebBaseLoader):
            f.write(response.content)

    def parse_filename(self, url: str) -> str:
-        """Parse the filename from a url.
+        """Parse the filename from an url.

        Args:
            url: Url to parse the filename from.

@@ -257,7 +260,7 @@ class BlackboardLoader(WebBaseLoader):
        return self._parse_filename_from_url(url)

    def _parse_filename_from_url(self, url: str) -> str:
-        """Parse the filename from a url.
+        """Parse the filename from an url.

        Args:
            url: Url to parse the filename from.
@@ -55,6 +55,16 @@ class BlockchainDocumentLoader(BaseLoader):
        get_all_tokens: bool = False,
        max_execution_time: Optional[int] = None,
    ):
+        """
+
+        Args:
+            contract_address: The address of the smart contract.
+            blockchainType: The blockchain type.
+            api_key: The Alchemy API key.
+            startToken: The start token for pagination.
+            get_all_tokens: Whether to get all tokens on the contract.
+            max_execution_time: The maximum execution time (sec).
+        """
        self.contract_address = contract_address
        self.blockchainType = blockchainType.value
        self.api_key = os.environ.get("ALCHEMY_API_KEY") or api_key
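A hedged usage sketch for the arguments listed above; the contract address and key are placeholders, and `BlockchainType` is assumed to be the enum defined alongside this loader:

```python
from langchain.document_loaders.blockchain import (
    BlockchainDocumentLoader,
    BlockchainType,
)

loader = BlockchainDocumentLoader(
    contract_address="0x1a92f7381b9f03921564a437210bb9396471050c",  # placeholder NFT contract
    blockchainType=BlockchainType.ETH_MAINNET,  # assumed enum member
    api_key="<your-alchemy-api-key>",  # or set the ALCHEMY_API_KEY env var
    get_all_tokens=False,
)
docs = loader.load()
```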
@@ -1,4 +1,3 @@
-"""Load conversations from ChatGPT data export"""
import datetime
import json
from typing import List

@@ -29,9 +28,15 @@ def concatenate_rows(message: dict, title: str) -> str:


class ChatGPTLoader(BaseLoader):
-    """Loader that loads conversations from exported ChatGPT data."""
+    """Load conversations from exported ChatGPT data."""

    def __init__(self, log_file: str, num_logs: int = -1):
+        """
+
+        Args:
+            log_file: Path to the log file
+            num_logs: Number of logs to load. If 0, load all logs.
+        """
        self.log_file = log_file
        self.num_logs = num_logs
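A usage sketch; the export path is a placeholder (ChatGPT data exports ship a `conversations.json`):

```python
from langchain.document_loaders import ChatGPTLoader

loader = ChatGPTLoader(log_file="./conversations.json", num_logs=1)  # placeholder path
docs = loader.load()
```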
@@ -284,9 +284,7 @@
    " error=False, # Only runs that succeed\n",
    ")\n",
    "for run in runs:\n",
-    " client.create_example(\n",
-    "     inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n",
-    " )"
+    " client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)"
   ]
  },
  {

@@ -333,7 +331,7 @@
    "eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n",
    "\n",
    "# Measures accuracy against ground truth\n",
-    "qa_evaluator = get_qa_evaluator(eval_llm) \n",
+    "qa_evaluator = get_qa_evaluator(eval_llm)\n",
    "\n",
    "# Measures how effective and efficient the agent's actions are\n",
    "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",

@@ -392,13 +390,13 @@
    "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n",
    "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n",
    "\n",
    "\n",
    "# Since chains can be stateful (e.g. they can have memory), we need provide\n",
    "# a way to initialize a new chain for each row in the dataset. This is done\n",
    "# by passing in a factory function that returns a new chain for each row.\n",
    "def agent_factory():\n",
-    " return initialize_agent(\n",
-    "     tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
-    ")\n",
+    " return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n",
    "\n",
    "\n",
    "# If your chain is NOT stateful, your factory can return the object directly\n",
    "# to improve runtime performance. For example:\n",

@@ -477,7 +475,7 @@
   "source": [
    "from langchain.client import (\n",
    " arun_on_dataset,\n",
-    " run_on_dataset, # Available if your chain doesn't support async calls.\n",
+    " run_on_dataset,  # Available if your chain doesn't support async calls.\n",
    ")\n",
    "\n",
    "?arun_on_dataset"

@@ -616,9 +614,7 @@
  },
  "outputs": [],
  "source": [
-    "agent = initialize_agent(\n",
-    " tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n",
-    ")"
+    "agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)"
   ]
  },
  {