From c838de502774557854e120bb7f18c53234d14edf Mon Sep 17 00:00:00 2001 From: Christophe Bornet Date: Fri, 24 May 2024 16:57:55 +0200 Subject: [PATCH] doc: Add doc for CassandraByteStore (#22126) Preview: https://langchain-git-fork-cbornet-doc-cassandrabytestore-langchain.vercel.app/v0.2/docs/integrations/stores/cassandra/ --- docs/docs/integrations/stores/cassandra.ipynb | 228 ++++++++++++++++++ .../langchain_community/storage/__init__.py | 5 + .../tests/unit_tests/storage/test_imports.py | 1 + 3 files changed, 234 insertions(+) create mode 100644 docs/docs/integrations/stores/cassandra.ipynb diff --git a/docs/docs/integrations/stores/cassandra.ipynb b/docs/docs/integrations/stores/cassandra.ipynb new file mode 100644 index 00000000000..bd9413da773 --- /dev/null +++ b/docs/docs/integrations/stores/cassandra.ipynb @@ -0,0 +1,228 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "sidebar_label: Cassandra\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cassandra\n", + "\n", + "[Cassandra](https://cassandra.apache.org/) is a NoSQL, row-oriented, highly scalable and highly available database.\n", + "\n", + "`CassandraByteStore` needs the `cassio` package to be installed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "vscode": { + "languageId": "plaintext" + } + }, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet cassio" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Store takes the following parameters:\n", + "\n", + "* table: The name of the table in which the data is stored.\n", + "* session: (Optional) The Cassandra driver session. If not provided, the cassio resolved session will be used.\n", + "* keyspace: (Optional) The keyspace of the table. If not provided, the cassio resolved keyspace will be used.\n", + "* setup_mode: (Optional) The mode used to create the Cassandra table (SYNC, ASYNC or OFF). Defaults to SYNC." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## CassandraByteStore\n", + "\n", + "The `CassandraByteStore` is an implementation of `ByteStore` that stores the data in your Cassandra instance.\n", + "The store keys must be strings and will be mapped to the `row_id` column of the Cassandra table.\n", + "The store `bytes` values are mapped to the `body_blob` column of the Cassandra table." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.storage import CassandraByteStore" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Init from a Cassandra driver Session\n", + "\n", + "You need to create a `cassandra.cluster.Session` object, as described in the [Cassandra driver documentation](https://docs.datastax.com/en/developer/python-driver/latest/api/cassandra/cluster/#module-cassandra.cluster). The details vary (e.g. with network settings and authentication), but the setup might look something like this:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from cassandra.cluster import Cluster\n", + "\n", + "cluster = Cluster()\n", + "session = cluster.connect()" + ] + }, + { + "cell_type": "markdown", + "source": [ + "You need to provide the name of an existing keyspace of the Cassandra instance:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "CASSANDRA_KEYSPACE = input(\"CASSANDRA_KEYSPACE = \")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Creating the store:" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[b'v1', b'v2']\n" + ] + } + ], + "source": [ + "store = 
CassandraByteStore(\n", + " table=\"my_store\",\n", + " session=session,\n", + " keyspace=CASSANDRA_KEYSPACE,\n", + ")\n", + "\n", + "store.mset([(\"k1\", b\"v1\"), (\"k2\", b\"v2\")])\n", + "print(store.mget([\"k1\", \"k2\"]))" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Init from cassio\n", + "\n", + "It's also possible to use cassio to configure the session and keyspace." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "import cassio\n", + "\n", + "cassio.init(contact_points=\"127.0.0.1\", keyspace=CASSANDRA_KEYSPACE)\n", + "\n", + "store = CassandraByteStore(\n", + " table=\"my_store\",\n", + ")\n", + "\n", + "store.mset([(\"k1\", b\"v1\"), (\"k2\", b\"v2\")])\n", + "print(store.mget([\"k1\", \"k2\"]))" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Usage with CacheBackedEmbeddings\n", + "\n", + "You may use the `CassandraByteStore` in conjunction with a [`CacheBackedEmbeddings`](/docs/how_to/caching_embeddings) to cache the result of embeddings computations.\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from langchain.embeddings import CacheBackedEmbeddings\n", + "from langchain_openai import OpenAIEmbeddings\n", + "\n", + "cassio.init(contact_points=\"127.0.0.1\", keyspace=CASSANDRA_KEYSPACE)\n", + "\n", + "store = CassandraByteStore(\n", + " table=\"my_store\",\n", + ")\n", + "\n", + "embeddings = CacheBackedEmbeddings.from_bytes_store(\n", + " underlying_embeddings=OpenAIEmbeddings(), document_embedding_cache=store\n", + ")" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/libs/community/langchain_community/storage/__init__.py b/libs/community/langchain_community/storage/__init__.py index 4095e40b670..9a73d49110a 100644 --- a/libs/community/langchain_community/storage/__init__.py +++ b/libs/community/langchain_community/storage/__init__.py @@ -22,6 +22,9 @@ if TYPE_CHECKING: AstraDBByteStore, AstraDBStore, ) + from langchain_community.storage.cassandra import ( + CassandraByteStore, + ) from langchain_community.storage.mongodb import ( MongoDBStore, ) @@ -36,6 +39,7 @@ if TYPE_CHECKING: __all__ = [ "AstraDBByteStore", "AstraDBStore", + "CassandraByteStore", "MongoDBStore", "RedisStore", "UpstashRedisByteStore", @@ -45,6 +49,7 @@ __all__ = [ _module_lookup = { "AstraDBByteStore": "langchain_community.storage.astradb", "AstraDBStore": "langchain_community.storage.astradb", + "CassandraByteStore": "langchain_community.storage.cassandra", "MongoDBStore": "langchain_community.storage.mongodb", "RedisStore": "langchain_community.storage.redis", "UpstashRedisByteStore": "langchain_community.storage.upstash_redis", diff --git a/libs/community/tests/unit_tests/storage/test_imports.py b/libs/community/tests/unit_tests/storage/test_imports.py index e624ecd07c2..750b7c5a3e2 100644 --- a/libs/community/tests/unit_tests/storage/test_imports.py +++ b/libs/community/tests/unit_tests/storage/test_imports.py @@ -3,6 +3,7 @@ from langchain_community.storage import __all__, _module_lookup EXPECTED_ALL = [ "AstraDBStore", "AstraDBByteStore", + "CassandraByteStore", "MongoDBStore", "RedisStore", "UpstashRedisByteStore",