From e36bc379f2ce65445d91f477b449922ce1ac0823 Mon Sep 17 00:00:00 2001 From: volodymyr-memsql <57520563+volodymyr-memsql@users.noreply.github.com> Date: Wed, 14 Feb 2024 21:43:12 +0200 Subject: [PATCH] community[patch]: Add vector index support to SingleStoreDB VectorStore (#17308) This pull request introduces support for various Approximate Nearest Neighbor (ANN) vector index algorithms in the VectorStore class, starting from version 8.5 of SingleStore DB. Leveraging this enhancement enables users to harness the power of vector indexing, significantly boosting search speed, particularly when handling large sets of vectors. --------- Co-authored-by: Volodymyr Tkachuk Co-authored-by: Bagatur --- .../vectorstores/singlestoredb.ipynb | 8 ++ .../vectorstores/singlestoredb.py | 98 ++++++++++++++++--- .../vectorstores/test_singlestoredb.py | 71 ++++++++++++++ 3 files changed, 166 insertions(+), 11 deletions(-) diff --git a/docs/docs/integrations/vectorstores/singlestoredb.ipynb b/docs/docs/integrations/vectorstores/singlestoredb.ipynb index fd20785ad45..6cae0d54424 100644 --- a/docs/docs/integrations/vectorstores/singlestoredb.ipynb +++ b/docs/docs/integrations/vectorstores/singlestoredb.ipynb @@ -106,6 +106,14 @@ "print(docs[0].page_content)" ] }, + { + "cell_type": "markdown", + "id": "035cba66", + "metadata": {}, + "source": [ + "Enhance your search efficiency with SingleStore DB version 8.5 or above by leveraging [ANN vector indexes](https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/). By setting `use_vector_index=True` during vector store object creation, you can activate this feature. Additionally, if your vectors differ in dimensionality from the default OpenAI embedding size of 1536, ensure to specify the `vector_size` parameter accordingly. 
" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/libs/community/langchain_community/vectorstores/singlestoredb.py b/libs/community/langchain_community/vectorstores/singlestoredb.py index c33858e9657..6d43b6199b8 100644 --- a/libs/community/langchain_community/vectorstores/singlestoredb.py +++ b/libs/community/langchain_community/vectorstores/singlestoredb.py @@ -57,6 +57,10 @@ class SingleStoreDB(VectorStore): content_field: str = "content", metadata_field: str = "metadata", vector_field: str = "vector", + use_vector_index: bool = False, + vector_index_name: str = "", + vector_index_options: Optional[dict] = None, + vector_size: int = 1536, pool_size: int = 5, max_overflow: int = 10, timeout: float = 30, @@ -88,6 +92,27 @@ class SingleStoreDB(VectorStore): vector_field (str, optional): Specifies the field to store the vector. Defaults to "vector". + use_vector_index (bool, optional): Toggles the use of a vector index. + Works only with SingleStoreDB 8.5 or later. Defaults to False. + If set to True, vector_size parameter is required to be set to + a proper value. + + vector_index_name (str, optional): Specifies the name of the vector index. + Defaults to empty. Will be ignored if use_vector_index is set to False. + + vector_index_options (dict, optional): Specifies the options for + the vector index. Defaults to {}. + Will be ignored if use_vector_index is set to False. The options are: + index_type (str, optional): Specifies the type of the index. + Defaults to IVF_PQFS. + For more options, please refer to the SingleStoreDB documentation: + https://docs.singlestore.com/cloud/reference/sql-reference/vector-functions/vector-indexing/ + + vector_size (int, optional): Specifies the size of the vector. + Defaults to 1536. Required if use_vector_index is set to True. + Should be set to the same value as the size of the vectors + stored in the vector_field. 
+ Following arguments pertain to the connection pool: pool_size (int, optional): Determines the number of active connections in @@ -177,6 +202,19 @@ class SingleStoreDB(VectorStore): os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db' vectorstore = SingleStoreDB(OpenAIEmbeddings()) + + Using vector index: + + .. code-block:: python + + from langchain_community.embeddings import OpenAIEmbeddings + from langchain_community.vectorstores import SingleStoreDB + + os.environ['SINGLESTOREDB_URL'] = 'me:p455w0rd@s2-host.com/my_db' + vectorstore = SingleStoreDB( + OpenAIEmbeddings(), + use_vector_index=True, + ) """ self.embedding = embedding @@ -186,6 +224,12 @@ class SingleStoreDB(VectorStore): self.metadata_field = self._sanitize_input(metadata_field) self.vector_field = self._sanitize_input(vector_field) + self.use_vector_index = bool(use_vector_index) + self.vector_index_name = self._sanitize_input(vector_index_name) + self.vector_index_options = dict(vector_index_options or {}) + self.vector_index_options["metric_type"] = self.distance_strategy + self.vector_size = int(vector_size) + # Pass the rest of the kwargs to the connection. self.connection_kwargs = kwargs @@ -194,7 +238,7 @@ class SingleStoreDB(VectorStore): self.connection_kwargs["conn_attrs"] = dict() self.connection_kwargs["conn_attrs"]["_connector_name"] = "langchain python sdk" - self.connection_kwargs["conn_attrs"]["_connector_version"] = "1.0.1" + self.connection_kwargs["conn_attrs"]["_connector_version"] = "1.0.2" # Create connection pool. 
self.connection_pool = QueuePool( @@ -222,16 +266,38 @@ class SingleStoreDB(VectorStore): try: cur = conn.cursor() try: - cur.execute( - """CREATE TABLE IF NOT EXISTS {} - ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci, - {} BLOB, {} JSON);""".format( - self.table_name, - self.content_field, - self.vector_field, - self.metadata_field, - ), - ) + if self.use_vector_index: + index_options = "" + if self.vector_index_options and len(self.vector_index_options) > 0: + index_options = "INDEX_OPTIONS '{}'".format( + json.dumps(self.vector_index_options) + ) + cur.execute( + """CREATE TABLE IF NOT EXISTS {} + ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci, + {} VECTOR({}, F32) NOT NULL, {} JSON, + VECTOR INDEX {} ({}) {});""".format( + self.table_name, + self.content_field, + self.vector_field, + self.vector_size, + self.metadata_field, + self.vector_index_name, + self.vector_field, + index_options, + ), + ) + else: + cur.execute( + """CREATE TABLE IF NOT EXISTS {} + ({} TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_general_ci, + {} BLOB, {} JSON);""".format( + self.table_name, + self.content_field, + self.vector_field, + self.metadata_field, + ), + ) finally: cur.close() finally: @@ -279,6 +345,8 @@ class SingleStoreDB(VectorStore): json.dumps(metadata), ), ) + if self.use_vector_index: + cur.execute("OPTIMIZE TABLE {} FLUSH;".format(self.table_name)) finally: cur.close() finally: @@ -406,6 +474,10 @@ class SingleStoreDB(VectorStore): content_field: str = "content", metadata_field: str = "metadata", vector_field: str = "vector", + use_vector_index: bool = False, + vector_index_name: str = "", + vector_index_options: Optional[dict] = None, + vector_size: int = 1536, pool_size: int = 5, max_overflow: int = 10, timeout: float = 30, @@ -438,6 +510,10 @@ class SingleStoreDB(VectorStore): pool_size=pool_size, max_overflow=max_overflow, timeout=timeout, + use_vector_index=use_vector_index, + vector_index_name=vector_index_name, + 
vector_index_options=vector_index_options, + vector_size=vector_size, **kwargs, ) instance.add_texts(texts, metadatas, embedding.embed_documents(texts), **kwargs) diff --git a/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py b/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py index 4bd23acbe77..4f690f079fd 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py +++ b/libs/community/tests/integration_tests/vectorstores/test_singlestoredb.py @@ -4,6 +4,7 @@ from typing import List import numpy as np import pytest from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings from langchain_community.vectorstores.singlestoredb import SingleStoreDB from langchain_community.vectorstores.utils import DistanceStrategy @@ -43,6 +44,16 @@ class NormilizedFakeEmbeddings(FakeEmbeddings): return self.normalize(super().embed_query(text)) +class RandomEmbeddings(Embeddings): + """Fake embeddings with random vectors. 
For testing purposes.""" + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + return [np.random.rand(100).tolist() for _ in texts] + + def embed_query(self, text: str) -> List[float]: + return np.random.rand(100).tolist() + + @pytest.fixture def texts() -> List[str]: return ["foo", "bar", "baz"] @@ -99,6 +110,66 @@ def test_singlestoredb_euclidean_distance(texts: List[str]) -> None: drop(table_name) +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_vector_index_1(texts: List[str]) -> None: + """Test vector index search with Euclidean distance""" + table_name = "test_singlestoredb_vector_index_1" + drop(table_name) + docsearch = SingleStoreDB.from_texts( + texts, + FakeEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + use_vector_index=True, + vector_size=10, + host=TEST_SINGLESTOREDB_URL, + ) + docsearch.add_texts(["foo"]) + output = docsearch.similarity_search("foo", k=2) + assert output == TEST_RESULT + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_vector_index_2(texts: List[str]) -> None: + """Test vector index search with custom IVF_PQ index options""" + table_name = "test_singlestoredb_vector_index_2" + drop(table_name) + docsearch = SingleStoreDB.from_texts( + texts, + FakeEmbeddings(), + table_name=table_name, + use_vector_index=True, + vector_index_options={"index_type": "IVF_PQ", "nlist": 256}, + vector_size=10, + host=TEST_SINGLESTOREDB_URL, + ) + docsearch.add_texts(["foo"]) + output = docsearch.similarity_search("foo", k=1) + assert output[0].page_content == "foo" + drop(table_name) + + +@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") +def test_singlestoredb_vector_index_large() -> None: + """Test vector index search over 300k random vectors""" + table_name = "test_singlestoredb_vector_index_large" + drop(table_name) + docsearch = SingleStoreDB.from_texts( + ["foo"] * 300000, 
RandomEmbeddings(), + distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE, + table_name=table_name, + use_vector_index=True, + vector_size=100, + vector_index_name="vector_index_large", + host=TEST_SINGLESTOREDB_URL, + ) + output = docsearch.similarity_search("foo", k=1) + assert output[0].page_content == "foo" + drop(table_name) + + @pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed") def test_singlestoredb_from_existing(texts: List[str]) -> None: """Test adding a new document"""