diff --git a/docs/docs/integrations/document_loaders/astradb.ipynb b/docs/docs/integrations/document_loaders/astradb.ipynb new file mode 100644 index 00000000000..da8c7c40437 --- /dev/null +++ b/docs/docs/integrations/document_loaders/astradb.ipynb @@ -0,0 +1,185 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "vm8vn9t8DvC_" + }, + "source": [ + "# AstraDB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "DataStax [Astra DB](https://docs.datastax.com/en/astra/home/astra.html) is a serverless vector-capable database built on Cassandra and made conveniently available through an easy-to-use JSON API." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "5WjXERXzFEhg" + }, + "source": [ + "## Overview" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "id": "juAmbgoWD17u" + }, + "source": [ + "The AstraDB Document Loader returns a list of LangChain Documents from an AstraDB database.\n", + "\n", + "The Loader takes the following parameters:\n", + "\n", + "* `api_endpoint`: AstraDB API endpoint. Looks like `https://01234567-89ab-cdef-0123-456789abcdef-us-east1.apps.astra.datastax.com`\n", + "* `token`: AstraDB token. Looks like `AstraCS:6gBhNmsk135....`\n", + "* `collection_name`: AstraDB collection name\n", + "* `namespace`: (Optional) AstraDB namespace\n", + "* `filter_criteria`: (Optional) Filter used in the find query\n", + "* `projection`: (Optional) Projection used in the find query\n", + "* `find_options`: (Optional) Options used in the find query\n", + "* `nb_prefetched`: (Optional) Number of documents pre-fetched by the loader\n", + "* `extraction_function`: (Optional) A function to convert the AstraDB document to the LangChain `page_content` string. 
Defaults to `json.dumps`\n", + "\n", + "The loader sets the following metadata on each returned LangChain Document:\n", + "\n", + "```python\n", + "{\n", + " metadata : {\n", + " \"namespace\": \"...\", \n", + " \"api_endpoint\": \"...\", \n", + " \"collection\": \"...\"\n", + " }\n", + "}\n", + "```" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load documents with the Document Loader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.document_loaders import AstraDBLoader" + ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "from getpass import getpass\n", + "\n", + "ASTRA_DB_API_ENDPOINT = input(\"ASTRA_DB_API_ENDPOINT = \")\n", + "ASTRA_DB_APPLICATION_TOKEN = getpass(\"ASTRA_DB_APPLICATION_TOKEN = \")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-08T12:41:22.643335Z", + "start_time": "2024-01-08T12:40:57.759116Z" + } + }, + "execution_count": 4 + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-08T12:42:25.395162Z", + "start_time": "2024-01-08T12:42:25.391387Z" + } + }, + "outputs": [], + "source": [ + "loader = AstraDBLoader(\n", + " api_endpoint=ASTRA_DB_API_ENDPOINT,\n", + " token=ASTRA_DB_APPLICATION_TOKEN,\n", + " collection_name=\"movie_reviews\",\n", + " projection={\"title\": 1, \"reviewtext\": 1},\n", + " find_options={\"limit\": 10},\n", + ")" + ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "docs = loader.load()" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2024-01-08T12:42:30.236489Z", + "start_time": "2024-01-08T12:42:29.612133Z" + } + }, + "execution_count": 7 + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2024-01-08T12:42:31.369394Z", + "start_time": "2024-01-08T12:42:31.359003Z" + 
} + }, + "outputs": [ + { + "data": { + "text/plain": "Document(page_content='{\"_id\": \"659bdffa16cbc4586b11a423\", \"title\": \"Dangerous Men\", \"reviewtext\": \"\\\\\"Dangerous Men,\\\\\" the picture\\'s production notes inform, took 26 years to reach the big screen. After having seen it, I wonder: What was the rush?\"}', metadata={'namespace': 'default_keyspace', 'api_endpoint': 'https://01234567-89ab-cdef-0123-456789abcdef-us-east1.apps.astra.datastax.com', 'collection': 'movie_reviews'})" + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs[0]" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [ + "5WjXERXzFEhg" + ], + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py index ca295e538eb..bf2ac63bce1 100644 --- a/libs/community/langchain_community/document_loaders/__init__.py +++ b/libs/community/langchain_community/document_loaders/__init__.py @@ -34,6 +34,7 @@ from langchain_community.document_loaders.arxiv import ArxivLoader from langchain_community.document_loaders.assemblyai import ( AssemblyAIAudioTranscriptLoader, ) +from langchain_community.document_loaders.astradb import AstraDBLoader from langchain_community.document_loaders.async_html import AsyncHtmlLoader from langchain_community.document_loaders.azlyrics import AZLyricsLoader from langchain_community.document_loaders.azure_ai_data import ( @@ -248,6 +249,7 @@ __all__ = [ "ArcGISLoader", "ArxivLoader", 
"AssemblyAIAudioTranscriptLoader", + "AstraDBLoader", "AsyncHtmlLoader", "AzureAIDataLoader", "AzureAIDocumentIntelligenceLoader", diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py index a2101c8830d..d730f6bfc19 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_imports.py +++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py @@ -21,6 +21,7 @@ EXPECTED_ALL = [ "ArcGISLoader", "ArxivLoader", "AssemblyAIAudioTranscriptLoader", + "AstraDBLoader", "AsyncHtmlLoader", "AzureAIDataLoader", "AzureAIDocumentIntelligenceLoader",