diff --git a/docs/extras/ecosystem/integrations/airtable.md b/docs/extras/ecosystem/integrations/airtable.md
new file mode 100644
index 00000000000..435f7fcce55
--- /dev/null
+++ b/docs/extras/ecosystem/integrations/airtable.md
@@ -0,0 +1,28 @@
+# Airtable
+
+>[Airtable](https://en.wikipedia.org/wiki/Airtable) is a cloud collaboration service.
+>`Airtable` is a spreadsheet-database hybrid, with the features of a database but applied to a spreadsheet.
+> The fields in an Airtable table are similar to cells in a spreadsheet, but have types such as 'checkbox',
+> 'phone number', and 'drop-down list', and can reference file attachments like images.
+
+>Users can create a database, set up column types, add records, link tables to one another, collaborate, sort records
+> and publish views to external websites.
+
+## Installation and Setup
+
+```bash
+pip install pyairtable
+```
+
+* Get your [API key](https://support.airtable.com/docs/creating-and-using-api-keys-and-access-tokens).
+* Get the [ID of your base](https://airtable.com/developers/web/api/introduction).
+* Get the [table ID from the table url](https://www.highviewapps.com/kb/where-can-i-find-the-airtable-base-id-and-table-id/#:~:text=Both%20the%20Airtable%20Base%20ID,URL%20that%20begins%20with%20tbl).
+
+## Document Loader
+
+
+```python
+from langchain.document_loaders import AirtableLoader
+```
+
+See an [example](/docs/modules/data_connection/document_loaders/integrations/airtable.html).
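The new integration page above stops at the import statement. A minimal usage sketch, assuming placeholder credentials and IDs (the positional order `api_token, table_id, base_id` follows the `AirtableLoader.__init__` signature shown later in this patch), could accompany it:

```python
from langchain.document_loaders import AirtableLoader

# Hypothetical values; substitute your own API key, base ID, and table ID.
api_key = "xxx"
base_id = "appXXXXXXXXXXXXXX"
table_id = "tblXXXXXXXXXXXXXX"

loader = AirtableLoader(api_key, table_id, base_id)
docs = loader.load()  # one Document per Airtable record
```

`load()` simply materializes `lazy_load()`, so very large tables can instead be streamed record by record via `lazy_load()`.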
diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/airtable.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/airtable.ipynb
index 98efa237e11..0ac03425d06 100644
--- a/docs/extras/modules/data_connection/document_loaders/integrations/airtable.ipynb
+++ b/docs/extras/modules/data_connection/document_loaders/integrations/airtable.ipynb
@@ -134,7 +134,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.16"
+   "version": "3.10.6"
   }
  },
  "nbformat": 4,
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index a32397ab96d..8fcecd3abf5 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -145,10 +145,10 @@ from langchain.document_loaders.youtube import (
     YoutubeLoader,
 )
 
-# Legacy: only for backwards compat. Use PyPDFLoader instead
+# Legacy: only for backwards compatibility. Use PyPDFLoader instead
 PagedPDFSplitter = PyPDFLoader
 
-# For backwards compatability
+# For backwards compatibility
 TelegramChatLoader = TelegramChatFileLoader
 
 __all__ = [
diff --git a/langchain/document_loaders/acreom.py b/langchain/document_loaders/acreom.py
index 1a31b94ff04..e43a72751c6 100644
--- a/langchain/document_loaders/acreom.py
+++ b/langchain/document_loaders/acreom.py
@@ -8,15 +8,20 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class AcreomLoader(BaseLoader):
+    """Loader that loads acreom vault from a directory."""
+
     FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
+    """Regex to match front matter metadata in markdown files."""
 
     def __init__(
        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
     ):
-        """Initialize with path."""
         self.file_path = path
+        """Path to the directory containing the markdown files."""
         self.encoding = encoding
+        """Encoding to use when reading the files."""
         self.collect_metadata = collect_metadata
+        """Whether to collect metadata from the front matter."""
 
     def _parse_front_matter(self, content: str) -> dict:
         """Parse front matter metadata from the content and return it as a dict."""
diff --git a/langchain/document_loaders/airbyte_json.py b/langchain/document_loaders/airbyte_json.py
index 824843447fe..f92b4e408cf 100644
--- a/langchain/document_loaders/airbyte_json.py
+++ b/langchain/document_loaders/airbyte_json.py
@@ -11,11 +11,11 @@ class AirbyteJSONLoader(BaseLoader):
     """Loader that loads local airbyte json files."""
 
     def __init__(self, file_path: str):
-        """Initialize with file path. This should start with '/tmp/airbyte_local/'."""
+        """Initialize with a file path. This should start with '/tmp/airbyte_local/'."""
         self.file_path = file_path
+        """Path to the JSON file to load."""
 
     def load(self) -> List[Document]:
-        """Load file."""
         text = ""
         for line in open(self.file_path, "r"):
             data = json.loads(line)["_airbyte_data"]
diff --git a/langchain/document_loaders/airtable.py b/langchain/document_loaders/airtable.py
index 0e2f34a2f1b..824799b2841 100644
--- a/langchain/document_loaders/airtable.py
+++ b/langchain/document_loaders/airtable.py
@@ -10,11 +10,14 @@ class AirtableLoader(BaseLoader):
     def __init__(self, api_token: str, table_id: str, base_id: str):
         """Initialize with API token and the IDs for table and base"""
         self.api_token = api_token
+        """Airtable API token."""
         self.table_id = table_id
+        """Airtable table ID."""
         self.base_id = base_id
+        """Airtable base ID."""
 
     def lazy_load(self) -> Iterator[Document]:
-        """Lazy load records from table."""
+        """Lazy load Documents from table."""
 
         from pyairtable import Table
 
@@ -32,5 +35,5 @@ class AirtableLoader(BaseLoader):
         )
 
     def load(self) -> List[Document]:
-        """Load Table."""
+        """Load Documents from table."""
         return list(self.lazy_load())
diff --git a/langchain/document_loaders/apify_dataset.py b/langchain/document_loaders/apify_dataset.py
index 469ae773ca1..7c0268fa4a8 100644
--- a/langchain/document_loaders/apify_dataset.py
+++ b/langchain/document_loaders/apify_dataset.py
@@ -1,4 +1,3 @@
-"""Logic for loading documents from Apify datasets."""
 from typing import Any, Callable, Dict, List
 
 from pydantic import BaseModel, root_validator
@@ -8,9 +7,10 @@ from langchain.document_loaders.base import BaseLoader
 
 
 class ApifyDatasetLoader(BaseLoader, BaseModel):
-    """Logic for loading documents from Apify datasets."""
+    """Loading Documents from Apify datasets."""
 
     apify_client: Any
+    """An instance of the ApifyClient class from the
apify-client Python package.""" dataset_id: str """The ID of the dataset on the Apify platform.""" dataset_mapping_function: Callable[[Dict], Document] @@ -34,7 +34,11 @@ class ApifyDatasetLoader(BaseLoader, BaseModel): @root_validator() def validate_environment(cls, values: Dict) -> Dict: - """Validate environment.""" + """Validate environment. + + Args: + values: The values to validate. + """ try: from apify_client import ApifyClient diff --git a/langchain/document_loaders/arxiv.py b/langchain/document_loaders/arxiv.py index 612788add49..d0ae9452385 100644 --- a/langchain/document_loaders/arxiv.py +++ b/langchain/document_loaders/arxiv.py @@ -19,8 +19,11 @@ class ArxivLoader(BaseLoader): load_all_available_meta: Optional[bool] = False, ): self.query = query + """The query to be passed to the arxiv.org API.""" self.load_max_docs = load_max_docs + """The maximum number of documents to load.""" self.load_all_available_meta = load_all_available_meta + """Whether to load all available metadata.""" def load(self) -> List[Document]: arxiv_client = ArxivAPIWrapper( diff --git a/langchain/document_loaders/azlyrics.py b/langchain/document_loaders/azlyrics.py index 0947946c116..219ea5a8738 100644 --- a/langchain/document_loaders/azlyrics.py +++ b/langchain/document_loaders/azlyrics.py @@ -9,7 +9,7 @@ class AZLyricsLoader(WebBaseLoader): """Loader that loads AZLyrics webpages.""" def load(self) -> List[Document]: - """Load webpage.""" + """Load webpages into Documents.""" soup = self.scrape() title = soup.title.text lyrics = soup.find_all("div", {"class": ""})[2].text diff --git a/langchain/document_loaders/azure_blob_storage_container.py b/langchain/document_loaders/azure_blob_storage_container.py index f63716e0db0..12155d7fd31 100644 --- a/langchain/document_loaders/azure_blob_storage_container.py +++ b/langchain/document_loaders/azure_blob_storage_container.py @@ -9,20 +9,23 @@ from langchain.document_loaders.base import BaseLoader class AzureBlobStorageContainerLoader(BaseLoader): - """Loading logic for loading documents from Azure Blob Storage.""" + """Loading Documents from Azure Blob Storage.""" def __init__(self, conn_str: str, container: str, prefix: str = ""): """Initialize with connection string, container and blob prefix.""" self.conn_str = conn_str + """Connection string for Azure Blob Storage.""" self.container = container + """Container name.""" self.prefix = prefix + """Prefix for blob names.""" def load(self) -> List[Document]: """Load documents.""" try: from azure.storage.blob import ContainerClient except ImportError as exc: - raise ValueError( + raise ImportError( "Could not import azure storage blob python package. " "Please install it with `pip install azure-storage-blob`." 
) from exc diff --git a/langchain/document_loaders/azure_blob_storage_file.py b/langchain/document_loaders/azure_blob_storage_file.py index 2fea91ffb32..64b7e2d6778 100644 --- a/langchain/document_loaders/azure_blob_storage_file.py +++ b/langchain/document_loaders/azure_blob_storage_file.py @@ -1,4 +1,3 @@ -"""Loading logic for loading documents from an Azure Blob Storage file.""" import os import tempfile from typing import List @@ -9,20 +8,23 @@ from langchain.document_loaders.unstructured import UnstructuredFileLoader class AzureBlobStorageFileLoader(BaseLoader): - """Loading logic for loading documents from Azure Blob Storage.""" + """Loading Documents from Azure Blob Storage.""" def __init__(self, conn_str: str, container: str, blob_name: str): """Initialize with connection string, container and blob name.""" self.conn_str = conn_str + """Connection string for Azure Blob Storage.""" self.container = container + """Container name.""" self.blob = blob_name + """Blob name.""" def load(self) -> List[Document]: """Load documents.""" try: from azure.storage.blob import BlobClient except ImportError as exc: - raise ValueError( + raise ImportError( "Could not import azure storage blob python package. " "Please install it with `pip install azure-storage-blob`." ) from exc diff --git a/langchain/document_loaders/base.py b/langchain/document_loaders/base.py index f176401a8cf..b41f985d510 100644 --- a/langchain/document_loaders/base.py +++ b/langchain/document_loaders/base.py @@ -8,10 +8,10 @@ from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter class BaseLoader(ABC): - """Interface for loading documents. + """Interface for loading Documents. Implementations should implement the lazy-loading method using generators - to avoid loading all documents into memory at once. + to avoid loading all Documents into memory at once. The `load` method will remain as is for backwards compatibility, but its implementation should be just `list(self.lazy_load())`. @@ -22,12 +22,20 @@ class BaseLoader(ABC): # This method returns a List which is materialized in memory. @abstractmethod def load(self) -> List[Document]: - """Load data into document objects.""" + """Load data into Document objects.""" def load_and_split( self, text_splitter: Optional[TextSplitter] = None ) -> List[Document]: - """Load documents and split into chunks.""" + """Load Documents and split into chunks. Chunks are returned as Documents. + + Args: + text_splitter: TextSplitter instance to use for splitting documents. + Defaults to RecursiveCharacterTextSplitter. + + Returns: + List of Documents. + """ if text_splitter is None: _text_splitter: TextSplitter = RecursiveCharacterTextSplitter() else: @@ -40,7 +48,7 @@ class BaseLoader(ABC): def lazy_load( self, ) -> Iterator[Document]: - """A lazy loader for document content.""" + """A lazy loader for Documents.""" raise NotImplementedError( f"{self.__class__.__name__} does not implement lazy_load()" ) @@ -49,7 +57,7 @@ class BaseLoader(ABC): class BaseBlobParser(ABC): """Abstract interface for blob parsers. - A blob parser is provides a way to parse raw data stored in a blob into one + A blob parser provides a way to parse raw data stored in a blob into one or more documents. 
The parser can be composed with blob loaders, making it easy to re-use diff --git a/langchain/document_loaders/bibtex.py b/langchain/document_loaders/bibtex.py index 30c34cd6b25..d538b1d44ca 100644 --- a/langchain/document_loaders/bibtex.py +++ b/langchain/document_loaders/bibtex.py @@ -34,8 +34,12 @@ class BibtexLoader(BaseLoader): Args: file_path: Path to the bibtex file. + parser: The parser to use. If None, a default parser is used. max_docs: Max number of associated documents to load. Use -1 means no limit. + max_content_chars: Maximum number of characters to load from the PDF. + load_extra_metadata: Whether to load extra metadata from the PDF. + file_pattern: Regex pattern to match the file name in the bibtex. """ self.file_path = file_path self.parser = parser or BibtexparserWrapper() @@ -70,9 +74,7 @@ class BibtexLoader(BaseLoader): def lazy_load(self) -> Iterator[Document]: """Load bibtex file using bibtexparser and get the article texts plus the - article metadata. - See https://bibtexparser.readthedocs.io/en/master/ Returns: diff --git a/langchain/document_loaders/bigquery.py b/langchain/document_loaders/bigquery.py index a185bb3c909..b49b1542f2b 100644 --- a/langchain/document_loaders/bigquery.py +++ b/langchain/document_loaders/bigquery.py @@ -37,7 +37,7 @@ class BigQueryLoader(BaseLoader): metadata_columns: Optional. The columns to write into the `metadata` of the document. credentials : google.auth.credentials.Credentials, optional - Credentials for accessing Google APIs. Use this parameter to override + Credentials for accessing Google APIs. Use this parameter to override default credentials, such as to use Compute Engine (`google.auth.compute_engine.Credentials`) or Service Account (`google.oauth2.service_account.Credentials`) credentials directly. @@ -52,7 +52,7 @@ class BigQueryLoader(BaseLoader): try: from google.cloud import bigquery except ImportError as ex: - raise ValueError( + raise ImportError( "Could not import google-cloud-bigquery python package. " "Please install it with `pip install google-cloud-bigquery`." ) from ex diff --git a/langchain/document_loaders/bilibili.py b/langchain/document_loaders/bilibili.py index 08aeb5baffd..d7fe3c80628 100644 --- a/langchain/document_loaders/bilibili.py +++ b/langchain/document_loaders/bilibili.py @@ -13,11 +13,15 @@ class BiliBiliLoader(BaseLoader): """Loader that loads bilibili transcripts.""" def __init__(self, video_urls: List[str]): - """Initialize with bilibili url.""" + """Initialize with bilibili url. + + Args: + video_urls: List of bilibili urls. + """ self.video_urls = video_urls def load(self) -> List[Document]: - """Load from bilibili url.""" + """Load Documents from bilibili url.""" results = [] for url in self.video_urls: transcript, video_info = self._get_bilibili_subs_and_info(url) @@ -30,7 +34,7 @@ class BiliBiliLoader(BaseLoader): try: from bilibili_api import sync, video except ImportError: - raise ValueError( + raise ImportError( "requests package not found, please install it with " "`pip install bilibili-api-python`" ) diff --git a/langchain/document_loaders/blackboard.py b/langchain/document_loaders/blackboard.py index ccddf19ed6d..2564a648dc6 100644 --- a/langchain/document_loaders/blackboard.py +++ b/langchain/document_loaders/blackboard.py @@ -12,7 +12,7 @@ from langchain.document_loaders.web_base import WebBaseLoader class BlackboardLoader(WebBaseLoader): - """Loader that loads all documents from a Blackboard course. + """Loads all documents from a Blackboard course. 
     This loader is not compatible with all Blackboard courses. It is only
     compatible with courses that use the new Blackboard interface.
@@ -34,8 +34,11 @@
     """
 
     base_url: str
+    """Base URL of the Blackboard course."""
     folder_path: str
+    """Path to the folder containing the documents."""
     load_all_recursively: bool
+    """If True, load all documents recursively."""
 
     def __init__(
         self,
@@ -64,7 +67,7 @@
         try:
             self.base_url = blackboard_course_url.split("/webapps/blackboard")[0]
         except IndexError:
-            raise ValueError(
+            raise IndexError(
                 "Invalid blackboard course url. "
                 "Please provide a url that starts with "
                 "https:///webapps/blackboard"
@@ -94,10 +97,10 @@
         )
 
     def load(self) -> List[Document]:
-        """Load data into document objects.
+        """Load data into Document objects.
 
         Returns:
-            List of documents.
+            List of Documents.
         """
         if self.load_all_recursively:
             soup_info = self.scrape()
@@ -118,7 +121,7 @@
         return self._get_documents(soup_info)
 
     def _get_folder_path(self, soup: Any) -> str:
-        """Get the folder path to save the documents in.
+        """Get the folder path to save the Documents in.
 
         Args:
             soup: BeautifulSoup4 soup object.
@@ -229,7 +232,7 @@
         return relative_paths
 
     def download(self, path: str) -> None:
-        """Download a file from a url.
+        """Download a file from a URL.
 
         Args:
             path: Path to the file.
@@ -243,7 +246,7 @@
             f.write(response.content)
 
     def parse_filename(self, url: str) -> str:
-        """Parse the filename from a url.
+        """Parse the filename from a URL.
 
         Args:
             url: Url to parse the filename from.
@@ -257,7 +260,7 @@
     def _parse_filename_from_url(self, url: str) -> str:
-        """Parse the filename from a url.
+        """Parse the filename from a URL.
 
         Args:
             url: Url to parse the filename from.
diff --git a/langchain/document_loaders/blockchain.py b/langchain/document_loaders/blockchain.py
index b1c6bce7122..6b103fe1c8c 100644
--- a/langchain/document_loaders/blockchain.py
+++ b/langchain/document_loaders/blockchain.py
@@ -55,6 +55,16 @@ class BlockchainDocumentLoader(BaseLoader):
         get_all_tokens: bool = False,
         max_execution_time: Optional[int] = None,
     ):
+        """
+
+        Args:
+            contract_address: The address of the smart contract.
+            blockchainType: The blockchain type.
+            api_key: The Alchemy API key.
+            startToken: The start token for pagination.
+            get_all_tokens: Whether to get all tokens on the contract.
+            max_execution_time: The maximum execution time (sec).
+        """
         self.contract_address = contract_address
         self.blockchainType = blockchainType.value
         self.api_key = os.environ.get("ALCHEMY_API_KEY") or api_key
diff --git a/langchain/document_loaders/chatgpt.py b/langchain/document_loaders/chatgpt.py
index 57a6eed7c41..d281b2ee13d 100644
--- a/langchain/document_loaders/chatgpt.py
+++ b/langchain/document_loaders/chatgpt.py
@@ -1,4 +1,3 @@
-"""Load conversations from ChatGPT data export"""
 import datetime
 import json
 from typing import List
@@ -29,9 +28,15 @@ def concatenate_rows(message: dict, title: str) -> str:
 
 
 class ChatGPTLoader(BaseLoader):
-    """Loader that loads conversations from exported ChatGPT data."""
+    """Load conversations from exported ChatGPT data."""
 
     def __init__(self, log_file: str, num_logs: int = -1):
+        """
+
+        Args:
+            log_file: Path to the log file.
+            num_logs: Number of logs to load.
If 0, load all logs. + """ self.log_file = log_file self.num_logs = num_logs diff --git a/langchain/experimental/client/tracing_datasets.ipynb b/langchain/experimental/client/tracing_datasets.ipynb index e72161b212b..fc3cdc8ab98 100644 --- a/langchain/experimental/client/tracing_datasets.ipynb +++ b/langchain/experimental/client/tracing_datasets.ipynb @@ -284,9 +284,7 @@ " error=False, # Only runs that succeed\n", ")\n", "for run in runs:\n", - " client.create_example(\n", - " inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id\n", - " )" + " client.create_example(inputs=run.inputs, outputs=run.outputs, dataset_id=dataset.id)" ] }, { @@ -333,7 +331,7 @@ "eval_llm = ChatOpenAI(model=\"gpt-4\", temperature=0)\n", "\n", "# Measures accuracy against ground truth\n", - "qa_evaluator = get_qa_evaluator(eval_llm) \n", + "qa_evaluator = get_qa_evaluator(eval_llm)\n", "\n", "# Measures how effective and efficient the agent's actions are\n", "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", @@ -392,13 +390,13 @@ "llm = ChatOpenAI(model=\"gpt-3.5-turbo-0613\", temperature=0)\n", "tools = load_tools([\"serpapi\", \"llm-math\"], llm=llm)\n", "\n", + "\n", "# Since chains can be stateful (e.g. they can have memory), we need provide\n", "# a way to initialize a new chain for each row in the dataset. This is done\n", "# by passing in a factory function that returns a new chain for each row.\n", "def agent_factory():\n", - " return initialize_agent(\n", - " tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n", - ")\n", + " return initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)\n", + "\n", "\n", "# If your chain is NOT stateful, your factory can return the object directly\n", "# to improve runtime performance. For example:\n", @@ -477,7 +475,7 @@ "source": [ "from langchain.client import (\n", " arun_on_dataset,\n", - " run_on_dataset, # Available if your chain doesn't support async calls.\n", + " run_on_dataset, # Available if your chain doesn't support async calls.\n", ")\n", "\n", "?arun_on_dataset" @@ -616,9 +614,7 @@ }, "outputs": [], "source": [ - "agent = initialize_agent(\n", - " tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False\n", - ")" + "agent = initialize_agent(tools, llm, agent=AgentType.OPENAI_FUNCTIONS, verbose=False)" ] }, {