LanceDB integration update (#22869)

Added:

- [x] relevance search (with and without scores)
- [x] maximal marginal relevance (MMR) search
- [x] image ingestion
- [x] filtering support
- [x] hybrid search with reranking

Verified with make test, lint_diff, and format.
This commit is contained in:
Raghav Dixit
2024-06-18 04:54:26 +01:00
committed by GitHub
parent 62c8a67f56
commit 55705c0f5e
3 changed files with 793 additions and 142 deletions

View File

@@ -11,7 +11,7 @@ def import_lancedb() -> Any:
import lancedb
except ImportError as e:
raise ImportError(
"Could not import pinecone lancedb package. "
"Could not import lancedb package. "
"Please install it with `pip install lancedb`."
) from e
return lancedb
@@ -56,3 +56,51 @@ def test_lancedb_add_texts() -> None:
result = store.similarity_search("text 2")
result_texts = [doc.page_content for doc in result]
assert "text 2" in result_texts
@pytest.mark.requires("lancedb")
def test_mmr() -> None:
    """Check that MMR search returns "text 1" for a text query and a vector query."""
    fake_embeddings = FakeEmbeddings()
    vector_store = LanceDB(embedding=fake_embeddings)
    vector_store.add_texts(["text 1", "text 2", "item 3"])

    # Query supplied as raw text.
    docs_from_text = vector_store.max_marginal_relevance_search(query="text")
    contents_from_text = [doc.page_content for doc in docs_from_text]
    assert "text 1" in contents_from_text

    # Same query, embedded by the caller and passed as a vector.
    docs_from_vector = vector_store.max_marginal_relevance_search_by_vector(
        fake_embeddings.embed_query("text")
    )
    contents_from_vector = [doc.page_content for doc in docs_from_vector]
    assert "text 1" in contents_from_vector
@pytest.mark.requires("lancedb")
def test_lancedb_delete() -> None:
    """Add three texts, delete with a filter on "text 1", and expect two rows left."""
    fake_embeddings = FakeEmbeddings()
    vector_store = LanceDB(embedding=fake_embeddings)
    vector_store.add_texts(["text 1", "text 2", "item 3"])

    vector_store.delete(filter="text = 'text 1'")

    remaining_rows = vector_store.get_table().count_rows()
    assert remaining_rows == 2
@pytest.mark.requires("lancedb")
def test_lancedb_all_searches() -> None:
    """Exercise the scored, by-vector, and scored-by-vector similarity searches."""
    fake_embeddings = FakeEmbeddings()
    vector_store = LanceDB(embedding=fake_embeddings)
    vector_store.add_texts(["text 1", "text 2", "item 3"])

    # Text query with relevance scores: the first hit must be a 2-item entry
    # whose first item is the matching document.
    scored_hits = vector_store.similarity_search_with_relevance_scores(
        "text 1", distance="cosine"
    )
    first_scored_hit = scored_hits[0]
    assert len(first_scored_hit) == 2
    assert "text 1" in first_scored_hit[0].page_content

    # Vector query without scores: hits are plain documents.
    vector_hits = vector_store.similarity_search_by_vector(
        fake_embeddings.embed_query("text 1")
    )
    assert "text 1" in vector_hits[0].page_content

    # Vector query with relevance scores: same 2-item entry shape as above.
    scored_vector_hits = vector_store.similarity_search_by_vector_with_relevance_scores(
        fake_embeddings.embed_query("text 1")
    )
    assert len(scored_vector_hits[0]) == 2  # type: ignore
    assert "text 1" in scored_vector_hits[0][0].page_content  # type: ignore