diff --git a/docs/docs/integrations/vectorstores/singlestoredb.ipynb b/docs/docs/integrations/vectorstores/singlestoredb.ipynb
index fd20785ad45..6cae0d54424 100644
--- a/docs/docs/integrations/vectorstores/singlestoredb.ipynb
+++ b/docs/docs/integrations/vectorstores/singlestoredb.ipynb
@@ -106,6 +106,14 @@
     "print(docs[0].page_content)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "035cba66",
+   "metadata": {},
+   "source": [
+    "SingleStoreDB version 8.5 and above supports [ANN vector indexes](https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/), which can speed up similarity search. To enable an index, pass `use_vector_index=True` when creating the vector store. If your vectors differ in dimensionality from the default OpenAI embedding size of 1536, also set the `vector_size` parameter accordingly."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
diff --git a/libs/community/langchain_community/vectorstores/singlestoredb.py b/libs/community/langchain_community/vectorstores/singlestoredb.py
index c33858e9657..6d43b6199b8 100644
--- a/libs/community/langchain_community/vectorstores/singlestoredb.py
+++ b/libs/community/langchain_community/vectorstores/singlestoredb.py
@@ -57,6 +57,10 @@ class SingleStoreDB(VectorStore):
         content_field: str = "content",
         metadata_field: str = "metadata",
         vector_field: str = "vector",
+        use_vector_index: bool = False,
+        vector_index_name: str = "",
+        vector_index_options: Optional[dict] = None,
+        vector_size: int = 1536,
         pool_size: int = 5,
         max_overflow: int = 10,
         timeout: float = 30,
@@ -88,6 +92,27 @@ class SingleStoreDB(VectorStore):
             vector_field (str, optional): Specifies the field to store the vector.
                 Defaults to "vector".
 
+            use_vector_index (bool, optional): Toggles the use of a vector index.
+                Works only with SingleStoreDB 8.5 or later. Defaults to False.
+                If set to True, the vector_size parameter must be set to
+                the dimensionality of the stored vectors.
+
+            vector_index_name (str, optional): Specifies the name of the vector index.
+                Defaults to an empty string. Ignored if use_vector_index is set to False.
+
+            vector_index_options (dict, optional): Specifies the options for
+                the vector index. Defaults to {}.
+                Ignored if use_vector_index is set to False. The options are:
+                index_type (str, optional): Specifies the type of the index.
+                    Defaults to IVF_PQFS.
+                For more options, please refer to the SingleStoreDB documentation:
+                https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/
+
+            vector_size (int, optional): Specifies the size of the vectors.
+                Defaults to 1536. Required if use_vector_index is set to True.
+                Should match the dimensionality of the vectors
+                stored in the vector_field.
+
             Following arguments pertain to the connection pool:
 
             pool_size (int, optional): Determines the number of active connections in
@@ -177,6 +202,19 @@ class SingleStoreDB(VectorStore):
 
                 os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db'
                 vectorstore = SingleStoreDB(OpenAIEmbeddings())
+
+            Using vector index:
+
+            .. code-block:: python
+
+                from langchain_community.embeddings import OpenAIEmbeddings
+                from langchain_community.vectorstores import SingleStoreDB
+
+                os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db'
+                vectorstore = SingleStoreDB(
+                    OpenAIEmbeddings(),
+                    use_vector_index=True,
+                )
         """
 
         self.embedding = embedding
@@ -186,6 +224,12 @@ class SingleStoreDB(VectorStore):
         self.metadata_field = self._sanitize_input(metadata_field)
         self.vector_field = self._sanitize_input(vector_field)
 
+        self.use_vector_index = bool(use_vector_index)
+        self.vector_index_name = self._sanitize_input(vector_index_name)
+        self.vector_index_options = dict(vector_index_options or {})
+        self.vector_index_options["metric_type"] = self.distance_strategy
+        self.vector_size = int(vector_size)
+
         # Pass the rest of the kwargs to the connection.
         self.connection_kwargs = kwargs
 
@@ -194,7 +238,7 @@ class SingleStoreDB(VectorStore):
             self.connection_kwargs["conn_attrs"] = dict()
 
         self.connection_kwargs["conn_attrs"]["_connector_name"] = "langchain python sdk"
-        self.connection_kwargs["conn_attrs"]["_connector_version"] = "1.0.1"
+        self.connection_kwargs["conn_attrs"]["_connector_version"] = "1.0.2"
 
         # Create connection pool.
         self.connection_pool = QueuePool(
@@ -222,16 +266,38 @@ class SingleStoreDB(VectorStore):
         try:
             cur = conn.cursor()
             try:
-                cur.execute(
-                    """CREATE TABLE IF NOT EXISTS {}
-                    ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
-                    {} BLOB, {} JSON);""".format(
-                        self.table_name,
-                        self.content_field,
-                        self.vector_field,
-                        self.metadata_field,
-                    ),
-                )
+                if self.use_vector_index:
+                    index_options = ""
+                    if self.vector_index_options and len(self.vector_index_options) > 0:
+                        index_options = "INDEX_OPTIONS '{}'".format(
+                            json.dumps(self.vector_index_options)
+                        )
+                    cur.execute(
+                        """CREATE TABLE IF NOT EXISTS {}
+                        ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
+                        {} VECTOR({}, F32) NOT NULL, {} JSON,
+                        VECTOR INDEX {} ({}) {});""".format(
+                            self.table_name,
+                            self.content_field,
+                            self.vector_field,
+                            self.vector_size,
+                            self.metadata_field,
+                            self.vector_index_name,
+                            self.vector_field,
+                            index_options,
+                        ),
+                    )
+                else:
+                    cur.execute(
+                        """CREATE TABLE IF NOT EXISTS {}
+                        ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci,
+                        {} BLOB, {} JSON);""".format(
+                            self.table_name,
+                            self.content_field,
+                            self.vector_field,
+                            self.metadata_field,
+                        ),
+                    )
             finally:
                 cur.close()
         finally:
@@ -279,6 +345,8 @@ class SingleStoreDB(VectorStore):
                             json.dumps(metadata),
                         ),
                     )
+                if self.use_vector_index:
+                    cur.execute("OPTIMIZE TABLE {} FLUSH;".format(self.table_name))
             finally:
                 cur.close()
         finally:
@@ -406,6 +474,10 @@ class SingleStoreDB(VectorStore):
         content_field: str = "content",
         metadata_field: str = "metadata",
         vector_field: str = "vector",
+        use_vector_index: bool = False,
+        vector_index_name: str = "",
+        vector_index_options: Optional[dict] = None,
+        vector_size: int = 1536,
         pool_size: int = 5,
         max_overflow: int = 10,
         timeout: float = 30,
@@ -438,6 +510,10 @@ class SingleStoreDB(VectorStore):
             pool_size=pool_size,
             max_overflow=max_overflow,
             timeout=timeout,
+            use_vector_index=use_vector_index,
+            vector_index_name=vector_index_name,
+            vector_index_options=vector_index_options,
+            vector_size=vector_size,
             **kwargs,
         )
         instance.add_texts(texts, metadatas, embedding.embed_documents(texts), **kwargs)
diff --git a/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py b/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py
index 4bd23acbe77..4f690f079fd 100644
--- a/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py
+++ b/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py
@@ -4,6 +4,7 @@ from typing import List
 import numpy as np
 import pytest
 from langchain_core.documents import Document
+from langchain_core.embeddings import Embeddings
 
 from langchain_community.vectorstores.singlestoredb import SingleStoreDB
 from langchain_community.vectorstores.utils import DistanceStrategy
@@ -43,6 +44,16 @@ class NormilizedFakeEmbeddings(FakeEmbeddings):
         return self.normalize(super().embed_query(text))
 
 
+class RandomEmbeddings(Embeddings):
+    """Fake embeddings with random vectors. For testing purposes."""
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        return [np.random.rand(100).tolist() for _ in texts]
+
+    def embed_query(self, text: str) -> List[float]:
+        return np.random.rand(100).tolist()
+
+
 @pytest.fixture
 def texts() -> List[str]:
     return ["foo", "bar", "baz"]
@@ -99,6 +110,66 @@ def test_singlestoredb_euclidean_distance(texts: List[str]) -> None:
     drop(table_name)
 
 
+@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
+def test_singlestoredb_vector_index_1(texts: List[str]) -> None:
+    """Test a vector index with Euclidean distance"""
+    table_name = "test_singlestoredb_vector_index_1"
+    drop(table_name)
+    docsearch = SingleStoreDB.from_texts(
+        texts,
+        FakeEmbeddings(),
+        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
+        table_name=table_name,
+        use_vector_index=True,
+        vector_size=10,
+        host=TEST_SINGLESTOREDB_URL,
+    )
+    docsearch.add_texts(["foo"])
+    output = docsearch.similarity_search("foo", k=2)
+    assert output == TEST_RESULT
+    drop(table_name)
+
+
+@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
+def test_singlestoredb_vector_index_2(texts: List[str]) -> None:
+    """Test a vector index with custom index options"""
+    table_name = "test_singlestoredb_vector_index_2"
+    drop(table_name)
+    docsearch = SingleStoreDB.from_texts(
+        texts,
+        FakeEmbeddings(),
+        table_name=table_name,
+        use_vector_index=True,
+        vector_index_options={"index_type": "IVF_PQ", "nlist": 256},
+        vector_size=10,
+        host=TEST_SINGLESTOREDB_URL,
+    )
+    docsearch.add_texts(["foo"])
+    output = docsearch.similarity_search("foo", k=1)
+    assert output[0].page_content == "foo"
+    drop(table_name)
+
+
+@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
+def test_singlestoredb_vector_index_large() -> None:
+    """Test a named vector index on a large number of documents"""
+    table_name = "test_singlestoredb_vector_index_large"
+    drop(table_name)
+    docsearch = SingleStoreDB.from_texts(
+        ["foo"] * 300000,
+        RandomEmbeddings(),
+        distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
+        table_name=table_name,
+        use_vector_index=True,
+        vector_size=100,
+        vector_index_name="vector_index_large",
+        host=TEST_SINGLESTOREDB_URL,
+    )
+    output = docsearch.similarity_search("foo", k=1)
+    assert output[0].page_content == "foo"
+    drop(table_name)
+
+
 @pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
 def test_singlestoredb_from_existing(texts: List[str]) -> None:
     """Test adding a new document"""
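For reviewers who want to try the change locally, here is a minimal usage sketch (not part of the diff) that pulls the new constructor arguments together. It assumes a reachable SingleStoreDB 8.5+ instance and valid OpenAI credentials for `OpenAIEmbeddings`; the connection URL, table name, and index name are illustrative placeholders, and the IVF_PQ options are borrowed from test_singlestoredb_vector_index_2.

```python
import os

from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import SingleStoreDB

# Placeholder connection string; point this at a SingleStoreDB 8.5+ database.
os.environ["SINGLESTOREDB_URL"] = "me:p455w0rd@s2-host.com/my_db"

vectorstore = SingleStoreDB.from_texts(
    ["foo", "bar", "baz"],
    OpenAIEmbeddings(),
    table_name="ann_demo",  # hypothetical table name
    use_vector_index=True,  # requires SingleStoreDB 8.5 or later
    vector_size=1536,  # must match the dimensionality of the embeddings
    vector_index_name="ann_demo_index",  # optional; the tests also run with the default empty name
    # Optional ANN tuning; these IVF_PQ settings mirror test_singlestoredb_vector_index_2.
    vector_index_options={"index_type": "IVF_PQ", "nlist": 256},
)

docs = vectorstore.similarity_search("foo", k=1)
print(docs[0].page_content)
```

With `use_vector_index=True`, `_create_table` creates a `VECTOR(vector_size, F32)` column with a `VECTOR INDEX` clause instead of the previous `BLOB` column, and `add_texts` issues `OPTIMIZE TABLE ... FLUSH` so newly inserted rows become visible to the ANN index.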