community[patch]: add hybrid search to singlestoredb vectorstore (#20793)

Implemented the ability to enable full-text search within the
SingleStore vector store, offering users a versatile range of search
strategies. This enhancement allows users to seamlessly combine
full-text search with vector search, enabling the following search
strategies:

* Search solely by vector similarity.
* Conduct searches exclusively based on text similarity, utilizing
Lucene internally.
* Filter search results by text similarity score, with the option to
specify a threshold, followed by a search based on vector similarity.
* Filter results by vector similarity score before conducting a search
based on text similarity.
* Perform searches using a weighted sum of vector and text similarity
scores.

Additionally, integration tests have been added to comprehensively cover
all scenarios.
Updated notebook with examples.

CC: @baskaryan, @hwchase17

---------

Co-authored-by: Volodymyr Tkachuk <vtkachuk-ua@singlestore.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
volodymyr-memsql
2024-04-25 00:34:50 +03:00
committed by GitHub
parent 9efab3ed66
commit 493afe4d8d
3 changed files with 879 additions and 53 deletions

View File

@@ -1,4 +1,5 @@
"""Test SingleStoreDB functionality."""
import math
import os
import tempfile
from typing import List
@@ -67,11 +68,76 @@ class RandomEmbeddings(Embeddings):
return [np.random.rand(100).tolist() for _ in uris]
class IncrementalEmbeddings(Embeddings):
"""Fake embeddings with incremental vectors. For testing purposes."""
def __init__(self) -> None:
self.counter = 0
def set_counter(self, counter: int) -> None:
self.counter = counter
def embed_query(self, text: str) -> List[float]:
self.counter += 1
return [
math.cos(self.counter * math.pi / 10),
math.sin(self.counter * math.pi / 10),
]
def embed_documents(self, texts: List[str]) -> List[List[float]]:
return [self.embed_query(text) for text in texts]
@pytest.fixture
def texts() -> List[str]:
return ["foo", "bar", "baz"]
@pytest.fixture
def snow_rain_docs() -> List[Document]:
return [
Document(
page_content="""In the parched desert, a sudden rainstorm brought relief,
as the droplets danced upon the thirsty earth, rejuvenating the landscape
with the sweet scent of petrichor.""",
metadata={"count": "1", "category": "rain", "group": "a"},
),
Document(
page_content="""Amidst the bustling cityscape, the rain fell relentlessly,
creating a symphony of pitter-patter on the pavement, while umbrellas
bloomed like colorful flowers in a sea of gray.""",
metadata={"count": "2", "category": "rain", "group": "a"},
),
Document(
page_content="""High in the mountains, the rain transformed into a delicate
mist, enveloping the peaks in a mystical veil, where each droplet seemed to
whisper secrets to the ancient rocks below.""",
metadata={"count": "3", "category": "rain", "group": "b"},
),
Document(
page_content="""Blanketing the countryside in a soft, pristine layer, the
snowfall painted a serene tableau, muffling the world in a tranquil hush
as delicate flakes settled upon the branches of trees like nature's own
lacework.""",
metadata={"count": "1", "category": "snow", "group": "b"},
),
Document(
page_content="""In the urban landscape, snow descended, transforming
bustling streets into a winter wonderland, where the laughter of
children echoed amidst the flurry of snowballs and the twinkle of
holiday lights.""",
metadata={"count": "2", "category": "snow", "group": "a"},
),
Document(
page_content="""Atop the rugged peaks, snow fell with an unyielding
intensity, sculpting the landscape into a pristine alpine paradise,
where the frozen crystals shimmered under the moonlight, casting a
spell of enchantment over the wilderness below.""",
metadata={"count": "3", "category": "snow", "group": "a"},
),
]
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb(texts: List[str]) -> None:
"""Test end to end construction and search."""
@@ -505,3 +571,184 @@ def test_singestoredb_add_image2() -> None:
output = docsearch.similarity_search("horse", k=1)
assert "horse" in output[0].page_content
drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_text_only_search(snow_rain_docs: List[Document]) -> None:
table_name = "test_singlestoredb_text_only_search"
drop(table_name)
docsearch = SingleStoreDB(
RandomEmbeddings(),
table_name=table_name,
use_full_text_search=True,
host=TEST_SINGLESTOREDB_URL,
)
docsearch.add_documents(snow_rain_docs)
output = docsearch.similarity_search(
"rainstorm in parched desert",
k=3,
filter={"count": "1"},
search_strategy=SingleStoreDB.SearchStrategy.TEXT_ONLY,
)
assert len(output) == 2
assert (
"In the parched desert, a sudden rainstorm brought relief,"
in output[0].page_content
)
assert (
"Blanketing the countryside in a soft, pristine layer" in output[1].page_content
)
output = docsearch.similarity_search(
"snowfall in countryside",
k=3,
search_strategy=SingleStoreDB.SearchStrategy.TEXT_ONLY,
)
assert len(output) == 3
assert (
"Blanketing the countryside in a soft, pristine layer,"
in output[0].page_content
)
drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_by_text_search(snow_rain_docs: List[Document]) -> None:
table_name = "test_singlestoredb_filter_by_text_search"
drop(table_name)
embeddings = IncrementalEmbeddings()
docsearch = SingleStoreDB.from_documents(
snow_rain_docs,
embeddings,
table_name=table_name,
use_full_text_search=True,
use_vector_index=True,
vector_size=2,
host=TEST_SINGLESTOREDB_URL,
)
output = docsearch.similarity_search(
"rainstorm in parched desert",
k=1,
search_strategy=SingleStoreDB.SearchStrategy.FILTER_BY_TEXT,
filter_threshold=0,
)
assert len(output) == 1
assert (
"In the parched desert, a sudden rainstorm brought relief"
in output[0].page_content
)
drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_by_vector_search1(snow_rain_docs: List[Document]) -> None:
table_name = "test_singlestoredb_filter_by_vector_search1"
drop(table_name)
embeddings = IncrementalEmbeddings()
docsearch = SingleStoreDB.from_documents(
snow_rain_docs,
embeddings,
table_name=table_name,
use_full_text_search=True,
use_vector_index=True,
vector_size=2,
host=TEST_SINGLESTOREDB_URL,
)
output = docsearch.similarity_search(
"rainstorm in parched desert, rain",
k=1,
filter={"category": "rain"},
search_strategy=SingleStoreDB.SearchStrategy.FILTER_BY_VECTOR,
filter_threshold=-0.2,
)
assert len(output) == 1
assert (
"High in the mountains, the rain transformed into a delicate"
in output[0].page_content
)
drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_filter_by_vector_search2(snow_rain_docs: List[Document]) -> None:
table_name = "test_singlestoredb_filter_by_vector_search2"
drop(table_name)
embeddings = IncrementalEmbeddings()
docsearch = SingleStoreDB.from_documents(
snow_rain_docs,
embeddings,
distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
table_name=table_name,
use_full_text_search=True,
use_vector_index=True,
vector_size=2,
host=TEST_SINGLESTOREDB_URL,
)
output = docsearch.similarity_search(
"rainstorm in parched desert, rain",
k=1,
filter={"group": "a"},
search_strategy=SingleStoreDB.SearchStrategy.FILTER_BY_VECTOR,
filter_threshold=1.5,
)
assert len(output) == 1
assert (
"Amidst the bustling cityscape, the rain fell relentlessly"
in output[0].page_content
)
drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_weighted_sum_search_unsupported_strategy(
snow_rain_docs: List[Document],
) -> None:
table_name = "test_singlestoredb_waighted_sum_search_unsupported_strategy"
drop(table_name)
embeddings = IncrementalEmbeddings()
docsearch = SingleStoreDB.from_documents(
snow_rain_docs,
embeddings,
table_name=table_name,
use_full_text_search=True,
use_vector_index=True,
vector_size=2,
host=TEST_SINGLESTOREDB_URL,
distance_strategy=DistanceStrategy.EUCLIDEAN_DISTANCE,
)
try:
docsearch.similarity_search(
"rainstorm in parched desert, rain",
k=1,
search_strategy=SingleStoreDB.SearchStrategy.WEIGHTED_SUM,
)
except ValueError as e:
assert "Search strategy WEIGHTED_SUM is not" in str(e)
drop(table_name)
@pytest.mark.skipif(not singlestoredb_installed, reason="singlestoredb not installed")
def test_singlestoredb_weighted_sum_search(snow_rain_docs: List[Document]) -> None:
table_name = "test_singlestoredb_waighted_sum_search"
drop(table_name)
embeddings = IncrementalEmbeddings()
docsearch = SingleStoreDB.from_documents(
snow_rain_docs,
embeddings,
table_name=table_name,
use_full_text_search=True,
use_vector_index=True,
vector_size=2,
host=TEST_SINGLESTOREDB_URL,
)
output = docsearch.similarity_search(
"rainstorm in parched desert, rain",
k=1,
search_strategy=SingleStoreDB.SearchStrategy.WEIGHTED_SUM,
filter={"category": "snow"},
)
assert len(output) == 1
assert (
"Atop the rugged peaks, snow fell with an unyielding" in output[0].page_content
)
drop(table_name)