community[patch]: Added add_images method to SingleStoreDB vector store (#17871)

In this pull request, we introduce the add_images method to the
SingleStoreDB vector store class, expanding its capabilities to handle
multi-modal embeddings seamlessly. This method facilitates the
incorporation of image data into the vector store by associating each
image's URI with corresponding document content, metadata, and either
pre-generated embeddings or embeddings computed using the embed_image
method of the provided embedding object.

The change includes integration tests validating the behavior of
add_images. Additionally, we provide a notebook showcasing the usage of
this new method.

---------

Co-authored-by: Volodymyr Tkachuk <vtkachuk-ua@singlestore.com>
This commit is contained in:
volodymyr-memsql
2024-02-22 01:16:32 +02:00
committed by GitHub
parent 7735721929
commit 0a9a519a39
3 changed files with 138 additions and 4 deletions

View File

@@ -303,6 +303,35 @@ class SingleStoreDB(VectorStore):
finally:
conn.close()
def add_images(
    self,
    uris: List[str],
    metadatas: Optional[List[dict]] = None,
    embeddings: Optional[List[List[float]]] = None,
    **kwargs: Any,
) -> List[str]:
    """Embed images and add them to the vectorstore, keyed by their URIs.

    When no embeddings are supplied and the configured embedding object
    exposes an ``embed_image`` method, that method is called with the URIs
    to compute the vectors. The URIs, metadatas and (possibly still missing)
    embeddings are then handed to ``add_texts``.

    Args:
        uris (List[str]): File paths to the images. Each URI is added to
            the vectorstore as document content.
        metadatas (Optional[List[dict]], optional): Optional list of
            metadatas. Defaults to None.
        embeddings (Optional[List[List[float]]], optional): Optional
            pre-generated embeddings. Defaults to None.

    Returns:
        List[str]: The list returned by ``add_texts``.
    """
    if embeddings is None:
        embedder = self.embedding
        # Only embedding objects with image support can produce the vectors;
        # otherwise ``add_texts`` receives ``None`` and decides what to do.
        if embedder is not None and hasattr(embedder, "embed_image"):
            embeddings = embedder.embed_image(uris=uris)

    return self.add_texts(uris, metadatas, embeddings, **kwargs)
def add_texts(
self,
texts: Iterable[str],

View File

@@ -1,4 +1,6 @@
"""Test SingleStoreDB functionality."""
import os
import tempfile
from typing import List
import numpy as np
@@ -14,6 +16,7 @@ TEST_SINGLESTOREDB_URL = "root:pass@localhost:3306/db"
# Document fixtures; presumably the expected search results asserted by tests
# elsewhere in this file -- confirm against their call sites.
TEST_SINGLE_RESULT = [Document(page_content="foo")]
TEST_SINGLE_WITH_METADATA_RESULT = [Document(page_content="foo", metadata={"a": "b"})]
TEST_RESULT = [Document(page_content="foo"), Document(page_content="foo")]
# Directory listed for ".jpg" files by test_singestoredb_add_image2. It is
# empty by default; set it to a directory containing images (that test
# searches for "horse") before running the test.
TEST_IMAGES_DIR = ""
try:
import singlestoredb as s2
@@ -22,6 +25,13 @@ try:
except ImportError:
singlestoredb_installed = False
# Optional dependency: record whether OpenCLIPEmbeddings can be imported so
# the test that needs it can be skipped via langchain_experimental_installed.
try:
from langchain_experimental.open_clip import OpenCLIPEmbeddings
langchain_experimental_installed = True
except ImportError:
langchain_experimental_installed = False
def drop(table_name: str) -> None:
with s2.connect(TEST_SINGLESTOREDB_URL) as conn:
@@ -53,6 +63,9 @@ class RandomEmbeddings(Embeddings):
def embed_query(self, text: str) -> List[float]:
    """Return a random 100-dimensional vector; ``text`` is not used."""
    random_vector = np.random.rand(100)
    return random_vector.tolist()
def embed_image(self, uris: List[str]) -> List[List[float]]:
    """Return one random 100-dimensional vector per URI; files are not read."""
    vectors: List[List[float]] = []
    for _ in uris:
        vectors.append(np.random.rand(100).tolist())
    return vectors
@pytest.fixture
def texts() -> List[str]:
@@ -156,7 +169,7 @@ def test_singlestoredb_vector_index_large() -> None:
table_name = "test_singlestoredb_vector_index_large"
drop(table_name)
docsearch = SingleStoreDB.from_texts(
["foo"] * 300000,
["foo"] * 30,
RandomEmbeddings(),
distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
table_name=table_name,
@@ -444,3 +457,51 @@ def test_singlestoredb_as_retriever(texts: List[str]) -> None:
),
]
drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_add_image(texts: List[str]) -> None:
    """Test adding images.

    Three placeholder files are registered through ``add_images`` and a
    similarity search must return one of their paths as page content.
    The ``texts`` fixture is accepted but not used by this test.
    """
    table_name = "test_singlestoredb_add_image"
    drop(table_name)
    docsearch = SingleStoreDB(
        RandomEmbeddings(),
        table_name=table_name,
        host=TEST_SINGLESTOREDB_URL,
    )
    # Create the placeholder files inside a temporary directory so they are
    # removed when the block exits, even if an assertion fails; files made
    # with NamedTemporaryFile(delete=False) would be left behind on every run.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_files = []
        for index in range(3):
            temp_path = os.path.join(temp_dir, f"image_{index}")
            with open(temp_path, "wb") as temp_file:
                temp_file.write(b"foo")
            temp_files.append(temp_path)
        docsearch.add_images(temp_files)
        output = docsearch.similarity_search("foo", k=1)
        assert output[0].page_content in temp_files
    drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
@pytest.mark.skipif(
    not langchain_experimental_installed, reason="langchain_experimental not installed"
)
def test_singestoredb_add_image2() -> None:
    """Test adding real images embedded with OpenCLIP.

    Every ``.jpg`` file in ``TEST_IMAGES_DIR`` is added through
    ``add_images``; a text search for "horse" must then return an image
    whose path contains "horse".
    """
    # TEST_IMAGES_DIR may be left empty or point nowhere; listing it would
    # raise FileNotFoundError, so skip instead of failing with an
    # environment error.
    if not os.path.isdir(TEST_IMAGES_DIR):
        pytest.skip("TEST_IMAGES_DIR does not point to a directory of images")
    image_uris = sorted(
        os.path.join(TEST_IMAGES_DIR, image_name)
        for image_name in os.listdir(TEST_IMAGES_DIR)
        if image_name.endswith(".jpg")
    )
    # Without images the search result would be empty and indexing it below
    # would raise IndexError rather than report a meaningful outcome.
    if not image_uris:
        pytest.skip("no .jpg images found in TEST_IMAGES_DIR")

    table_name = "test_singlestoredb_add_images"
    drop(table_name)
    docsearch = SingleStoreDB(
        OpenCLIPEmbeddings(),
        table_name=table_name,
        host=TEST_SINGLESTOREDB_URL,
    )
    docsearch.add_images(image_uris)
    output = docsearch.similarity_search("horse", k=1)
    assert "horse" in output[0].page_content
    drop(table_name)