community[patch]: update for compatibility with latest Meilisearch version (#18970)

- **Description:** Updates the Meilisearch vector store for compatibility
with Meilisearch v1.6 and above. Adds the `embedders` index setting and the
`embedder_name` parameter, both of which are now required.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
CaroFG 2024-03-27 22:08:27 +00:00 committed by GitHub
parent be2adb1083
commit cf96060ab7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 107 additions and 21 deletions

View File

@ -130,7 +130,14 @@
"from langchain_openai import OpenAIEmbeddings\n", "from langchain_openai import OpenAIEmbeddings\n",
"from langchain_text_splitters import CharacterTextSplitter\n", "from langchain_text_splitters import CharacterTextSplitter\n",
"\n", "\n",
"embeddings = OpenAIEmbeddings()" "embeddings = OpenAIEmbeddings()\n",
"embedders = {\n",
" \"default\": {\n",
" \"source\": \"userProvided\",\n",
" \"dimensions\": 1536,\n",
" }\n",
"}\n",
"embedder_name = \"default\""
] ]
}, },
{ {
@ -152,7 +159,9 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"# Use Meilisearch vector store to store texts & associated embeddings as vector\n", "# Use Meilisearch vector store to store texts & associated embeddings as vector\n",
"vector_store = Meilisearch.from_texts(texts=texts, embedding=embeddings)" "vector_store = Meilisearch.from_texts(\n",
" texts=texts, embedding=embeddings, embedders=embedders, embedder_name=embedder_name\n",
")"
] ]
}, },
{ {
@ -188,11 +197,16 @@
"docs = text_splitter.split_documents(documents)\n", "docs = text_splitter.split_documents(documents)\n",
"\n", "\n",
"# Import documents & embeddings in the vector store\n", "# Import documents & embeddings in the vector store\n",
"vector_store = Meilisearch.from_documents(documents=documents, embedding=embeddings)\n", "vector_store = Meilisearch.from_documents(\n",
" documents=documents,\n",
" embedding=embeddings,\n",
" embedders=embedders,\n",
" embedder_name=embedder_name,\n",
")\n",
"\n", "\n",
"# Search in our vector store\n", "# Search in our vector store\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = vector_store.similarity_search(query)\n", "docs = vector_store.similarity_search(query, embedder_name=embedder_name)\n",
"print(docs[0].page_content)" "print(docs[0].page_content)"
] ]
}, },
@ -221,7 +235,11 @@
"\n", "\n",
"client = meilisearch.Client(url=\"http://127.0.0.1:7700\", api_key=\"***\")\n", "client = meilisearch.Client(url=\"http://127.0.0.1:7700\", api_key=\"***\")\n",
"vector_store = Meilisearch(\n", "vector_store = Meilisearch(\n",
" embedding=embeddings, client=client, index_name=\"langchain_demo\", text_key=\"text\"\n", " embedding=embeddings,\n",
" embedders=embedders,\n",
" client=client,\n",
" index_name=\"langchain_demo\",\n",
" text_key=\"text\",\n",
")\n", ")\n",
"vector_store.add_documents(documents)" "vector_store.add_documents(documents)"
] ]
@ -232,7 +250,7 @@
"source": [ "source": [
"## Similarity Search with score\n", "## Similarity Search with score\n",
"\n", "\n",
"This specific method allows you to return the documents and the distance score of the query to them." "This specific method allows you to return the documents and the distance score of the query to them. `embedder_name` is the name of the embedder that should be used for semantic search; it defaults to \"default\"."
] ]
}, },
{ {
@ -241,7 +259,9 @@
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"docs_and_scores = vector_store.similarity_search_with_score(query)\n", "docs_and_scores = vector_store.similarity_search_with_score(\n",
" query, embedder_name=embedder_name\n",
")\n",
"docs_and_scores[0]" "docs_and_scores[0]"
] ]
}, },
@ -249,7 +269,8 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Similarity Search by vector" "## Similarity Search by vector\n",
"`embedder_name` is the name of the embedder that should be used for semantic search; it defaults to \"default\"."
] ]
}, },
{ {
@ -259,7 +280,9 @@
"outputs": [], "outputs": [],
"source": [ "source": [
"embedding_vector = embeddings.embed_query(query)\n", "embedding_vector = embeddings.embed_query(query)\n",
"docs_and_scores = vector_store.similarity_search_by_vector(embedding_vector)\n", "docs_and_scores = vector_store.similarity_search_by_vector(\n",
" embedding_vector, embedder_name=embedder_name\n",
")\n",
"docs_and_scores[0]" "docs_and_scores[0]"
] ]
}, },

View File

@ -65,8 +65,15 @@ class Meilisearch(VectorStore):
# api_key is optional; provide it if your meilisearch instance requires it # api_key is optional; provide it if your meilisearch instance requires it
client = meilisearch.Client(url='http://127.0.0.1:7700', api_key='***') client = meilisearch.Client(url='http://127.0.0.1:7700', api_key='***')
embeddings = OpenAIEmbeddings() embeddings = OpenAIEmbeddings()
embedders = {
"theEmbedderName": {
"source": "userProvided",
"dimensions": 1536
}
}
vectorstore = Meilisearch( vectorstore = Meilisearch(
embedding=embeddings, embedding=embeddings,
embedders=embedders,
client=client, client=client,
index_name='langchain_demo', index_name='langchain_demo',
text_key='text') text_key='text')
@ -81,6 +88,8 @@ class Meilisearch(VectorStore):
index_name: str = "langchain-demo", index_name: str = "langchain-demo",
text_key: str = "text", text_key: str = "text",
metadata_key: str = "metadata", metadata_key: str = "metadata",
*,
embedders: Optional[Dict[str, Any]] = None,
): ):
"""Initialize with Meilisearch client.""" """Initialize with Meilisearch client."""
client = _create_client(client=client, url=url, api_key=api_key) client = _create_client(client=client, url=url, api_key=api_key)
@ -90,18 +99,24 @@ class Meilisearch(VectorStore):
self._embedding = embedding self._embedding = embedding
self._text_key = text_key self._text_key = text_key
self._metadata_key = metadata_key self._metadata_key = metadata_key
self._embedders = embedders
self._embedders_settings = self._client.index(
str(self._index_name)
).update_embedders(embedders)
def add_texts( def add_texts(
self, self,
texts: Iterable[str], texts: Iterable[str],
metadatas: Optional[List[dict]] = None, metadatas: Optional[List[dict]] = None,
ids: Optional[List[str]] = None, ids: Optional[List[str]] = None,
embedder_name: Optional[str] = "default",
**kwargs: Any, **kwargs: Any,
) -> List[str]: ) -> List[str]:
"""Run more texts through the embedding and add them to the vector store. """Run more texts through the embedding and add them to the vector store.
Args: Args:
texts (Iterable[str]): Iterable of strings/text to add to the vectorstore. texts (Iterable[str]): Iterable of strings/text to add to the vectorstore.
embedder_name: Name of the embedder. Defaults to "default".
metadatas (Optional[List[dict]]): Optional list of metadata. metadatas (Optional[List[dict]]): Optional list of metadata.
Defaults to None. Defaults to None.
ids Optional[List[str]]: Optional list of IDs. ids Optional[List[str]]: Optional list of IDs.
@ -128,7 +143,7 @@ class Meilisearch(VectorStore):
docs.append( docs.append(
{ {
"id": id, "id": id,
"_vectors": embedding, "_vectors": {f"{embedder_name}": embedding},
f"{self._metadata_key}": metadata, f"{self._metadata_key}": metadata,
} }
) )
@ -142,12 +157,14 @@ class Meilisearch(VectorStore):
query: str, query: str,
k: int = 4, k: int = 4,
filter: Optional[Dict[str, str]] = None, filter: Optional[Dict[str, str]] = None,
embedder_name: Optional[str] = "default",
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return meilisearch documents most similar to the query. """Return meilisearch documents most similar to the query.
Args: Args:
query (str): Query text for which to find similar documents. query (str): Query text for which to find similar documents.
embedder_name: Name of the embedder to be used. Defaults to "default".
k (int): Number of documents to return. Defaults to 4. k (int): Number of documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. filter (Optional[Dict[str, str]]): Filter by metadata.
Defaults to None. Defaults to None.
@ -158,6 +175,7 @@ class Meilisearch(VectorStore):
""" """
docs_and_scores = self.similarity_search_with_score( docs_and_scores = self.similarity_search_with_score(
query=query, query=query,
embedder_name=embedder_name,
k=k, k=k,
filter=filter, filter=filter,
kwargs=kwargs, kwargs=kwargs,
@ -169,12 +187,14 @@ class Meilisearch(VectorStore):
query: str, query: str,
k: int = 4, k: int = 4,
filter: Optional[Dict[str, str]] = None, filter: Optional[Dict[str, str]] = None,
embedder_name: Optional[str] = "default",
**kwargs: Any, **kwargs: Any,
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
"""Return meilisearch documents most similar to the query, along with scores. """Return meilisearch documents most similar to the query, along with scores.
Args: Args:
query (str): Query text for which to find similar documents. query (str): Query text for which to find similar documents.
embedder_name: Name of the embedder to be used. Defaults to "default".
k (int): Number of documents to return. Defaults to 4. k (int): Number of documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. filter (Optional[Dict[str, str]]): Filter by metadata.
Defaults to None. Defaults to None.
@ -187,6 +207,7 @@ class Meilisearch(VectorStore):
docs = self.similarity_search_by_vector_with_scores( docs = self.similarity_search_by_vector_with_scores(
embedding=_query, embedding=_query,
embedder_name=embedder_name,
k=k, k=k,
filter=filter, filter=filter,
kwargs=kwargs, kwargs=kwargs,
@ -196,6 +217,7 @@ class Meilisearch(VectorStore):
def similarity_search_by_vector_with_scores( def similarity_search_by_vector_with_scores(
self, self,
embedding: List[float], embedding: List[float],
embedder_name: Optional[str] = "default",
k: int = 4, k: int = 4,
filter: Optional[Dict[str, Any]] = None, filter: Optional[Dict[str, Any]] = None,
**kwargs: Any, **kwargs: Any,
@ -204,6 +226,7 @@ class Meilisearch(VectorStore):
Args: Args:
embedding (List[float]): Embedding to look up similar documents. embedding (List[float]): Embedding to look up similar documents.
embedder_name: Name of the embedder to be used. Defaults to "default".
k (int): Number of documents to return. Defaults to 4. k (int): Number of documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. filter (Optional[Dict[str, str]]): Filter by metadata.
Defaults to None. Defaults to None.
@ -214,7 +237,13 @@ class Meilisearch(VectorStore):
""" """
docs = [] docs = []
results = self._client.index(str(self._index_name)).search( results = self._client.index(str(self._index_name)).search(
"", {"vector": embedding, "limit": k, "filter": filter} "",
{
"vector": embedding,
"hybrid": {"semanticRatio": 1.0, "embedder": embedder_name},
"limit": k,
"filter": filter,
},
) )
for result in results["hits"]: for result in results["hits"]:
@ -233,12 +262,14 @@ class Meilisearch(VectorStore):
embedding: List[float], embedding: List[float],
k: int = 4, k: int = 4,
filter: Optional[Dict[str, str]] = None, filter: Optional[Dict[str, str]] = None,
embedder_name: Optional[str] = "default",
**kwargs: Any, **kwargs: Any,
) -> List[Document]: ) -> List[Document]:
"""Return meilisearch documents most similar to embedding vector. """Return meilisearch documents most similar to embedding vector.
Args: Args:
embedding (List[float]): Embedding to look up similar documents. embedding (List[float]): Embedding to look up similar documents.
embedder_name: Name of the embedder to be used. Defaults to "default".
k (int): Number of documents to return. Defaults to 4. k (int): Number of documents to return. Defaults to 4.
filter (Optional[Dict[str, str]]): Filter by metadata. filter (Optional[Dict[str, str]]): Filter by metadata.
Defaults to None. Defaults to None.
@ -249,6 +280,7 @@ class Meilisearch(VectorStore):
""" """
docs = self.similarity_search_by_vector_with_scores( docs = self.similarity_search_by_vector_with_scores(
embedding=embedding, embedding=embedding,
embedder_name=embedder_name,
k=k, k=k,
filter=filter, filter=filter,
kwargs=kwargs, kwargs=kwargs,
@ -268,6 +300,8 @@ class Meilisearch(VectorStore):
ids: Optional[List[str]] = None, ids: Optional[List[str]] = None,
text_key: Optional[str] = "text", text_key: Optional[str] = "text",
metadata_key: Optional[str] = "metadata", metadata_key: Optional[str] = "metadata",
embedders: Dict[str, Any] = {},
embedder_name: Optional[str] = "default",
**kwargs: Any, **kwargs: Any,
) -> Meilisearch: ) -> Meilisearch:
"""Construct Meilisearch wrapper from raw documents. """Construct Meilisearch wrapper from raw documents.
@ -288,21 +322,25 @@ class Meilisearch(VectorStore):
# The environment should be the one specified next to the API key # The environment should be the one specified next to the API key
# in your Meilisearch console # in your Meilisearch console
client = meilisearch.Client(url='http://127.0.0.1:7700', api_key='***') client = meilisearch.Client(url='http://127.0.0.1:7700', api_key='***')
embeddings = OpenAIEmbeddings() embedding = OpenAIEmbeddings()
embedders: Embedders index setting.
embedder_name: Name of the embedder. Defaults to "default".
docsearch = Meilisearch.from_texts( docsearch = Meilisearch.from_texts(
client=client, client=client,
embeddings=embeddings, embedding=embedding,
) )
""" """
client = _create_client(client=client, url=url, api_key=api_key) client = _create_client(client=client, url=url, api_key=api_key)
vectorstore = cls( vectorstore = cls(
embedding=embedding, embedding=embedding,
embedders=embedders,
client=client, client=client,
index_name=index_name, index_name=index_name,
) )
vectorstore.add_texts( vectorstore.add_texts(
texts=texts, texts=texts,
embedder_name=embedder_name,
metadatas=metadatas, metadatas=metadatas,
ids=ids, ids=ids,
text_key=text_key, text_key=text_key,

View File

@ -1,5 +1,6 @@
"""Test Meilisearch functionality.""" """Test Meilisearch functionality."""
from typing import TYPE_CHECKING, Generator
from typing import TYPE_CHECKING, Any, Dict, Generator
import pytest import pytest
import requests import requests
@ -33,6 +34,16 @@ class TestMeilisearchVectorSearch:
timeout=10, timeout=10,
) )
@pytest.fixture
def new_embedders(self) -> Dict[str, Dict[str, Any]]:
return {
"default": {
"source": "userProvided",
# Dimension defined in FakeEmbeddings as [float(1.0)] * 9 + [float(0.0)]
"dimensions": 10,
}
}
@pytest.fixture(autouse=True) @pytest.fixture(autouse=True)
def setup(self) -> None: def setup(self) -> None:
self.delete_all_indexes() self.delete_all_indexes()
@ -63,12 +74,14 @@ class TestMeilisearchVectorSearch:
# Wait for the last task to be completed # Wait for the last task to be completed
client.wait_for_task(tasks.results[0].uid) client.wait_for_task(tasks.results[0].uid)
def test_meilisearch(self) -> None: def test_meilisearch(self, new_embedders: Dict[str, Any]) -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
vectorstore = Meilisearch.from_texts( vectorstore = Meilisearch.from_texts(
texts=texts, texts=texts,
embedding=FakeEmbeddings(), embedding=FakeEmbeddings(),
embedders=new_embedders,
embedder_name=list(new_embedders)[0],
url=TEST_MEILI_HTTP_ADDR, url=TEST_MEILI_HTTP_ADDR,
api_key=TEST_MEILI_MASTER_KEY, api_key=TEST_MEILI_MASTER_KEY,
index_name=INDEX_NAME, index_name=INDEX_NAME,
@ -77,12 +90,14 @@ class TestMeilisearchVectorSearch:
output = vectorstore.similarity_search("foo", k=1) output = vectorstore.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")] assert output == [Document(page_content="foo")]
def test_meilisearch_with_client(self) -> None: def test_meilisearch_with_client(self, new_embedders: Dict[str, Any]) -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
vectorstore = Meilisearch.from_texts( vectorstore = Meilisearch.from_texts(
texts=texts, texts=texts,
embedding=FakeEmbeddings(), embedding=FakeEmbeddings(),
embedders=new_embedders,
embedder_name=list(new_embedders)[0],
client=self.client(), client=self.client(),
index_name=INDEX_NAME, index_name=INDEX_NAME,
) )
@ -90,13 +105,15 @@ class TestMeilisearchVectorSearch:
output = vectorstore.similarity_search("foo", k=1) output = vectorstore.similarity_search("foo", k=1)
assert output == [Document(page_content="foo")] assert output == [Document(page_content="foo")]
def test_meilisearch_with_metadatas(self) -> None: def test_meilisearch_with_metadatas(self, new_embedders: Dict[str, Any]) -> None:
"""Test end to end construction and search.""" """Test end to end construction and search."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))] metadatas = [{"page": i} for i in range(len(texts))]
docsearch = Meilisearch.from_texts( docsearch = Meilisearch.from_texts(
texts=texts, texts=texts,
embedding=FakeEmbeddings(), embedding=FakeEmbeddings(),
embedders=new_embedders,
embedder_name=list(new_embedders)[0],
url=TEST_MEILI_HTTP_ADDR, url=TEST_MEILI_HTTP_ADDR,
api_key=TEST_MEILI_MASTER_KEY, api_key=TEST_MEILI_MASTER_KEY,
index_name=INDEX_NAME, index_name=INDEX_NAME,
@ -109,13 +126,17 @@ class TestMeilisearchVectorSearch:
assert output[0].metadata["page"] == 0 assert output[0].metadata["page"] == 0
assert output == [Document(page_content="foo", metadata={"page": 0})] assert output == [Document(page_content="foo", metadata={"page": 0})]
def test_meilisearch_with_metadatas_with_scores(self) -> None: def test_meilisearch_with_metadatas_with_scores(
self, new_embedders: Dict[str, Any]
) -> None:
"""Test end to end construction and scored search.""" """Test end to end construction and scored search."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))] metadatas = [{"page": str(i)} for i in range(len(texts))]
docsearch = Meilisearch.from_texts( docsearch = Meilisearch.from_texts(
texts=texts, texts=texts,
embedding=FakeEmbeddings(), embedding=FakeEmbeddings(),
embedders=new_embedders,
embedder_name=list(new_embedders)[0],
url=TEST_MEILI_HTTP_ADDR, url=TEST_MEILI_HTTP_ADDR,
api_key=TEST_MEILI_MASTER_KEY, api_key=TEST_MEILI_MASTER_KEY,
index_name=INDEX_NAME, index_name=INDEX_NAME,
@ -123,9 +144,11 @@ class TestMeilisearchVectorSearch:
) )
self._wait_last_task() self._wait_last_task()
output = docsearch.similarity_search_with_score("foo", k=1) output = docsearch.similarity_search_with_score("foo", k=1)
assert output == [(Document(page_content="foo", metadata={"page": "0"}), 9.0)] assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)]
def test_meilisearch_with_metadatas_with_scores_using_vector(self) -> None: def test_meilisearch_with_metadatas_with_scores_using_vector(
self, new_embedders: Dict[str, Any]
) -> None:
"""Test end to end construction and scored search, using embedding vector.""" """Test end to end construction and scored search, using embedding vector."""
texts = ["foo", "bar", "baz"] texts = ["foo", "bar", "baz"]
metadatas = [{"page": str(i)} for i in range(len(texts))] metadatas = [{"page": str(i)} for i in range(len(texts))]
@ -134,6 +157,8 @@ class TestMeilisearchVectorSearch:
docsearch = Meilisearch.from_texts( docsearch = Meilisearch.from_texts(
texts=texts, texts=texts,
embedding=FakeEmbeddings(), embedding=FakeEmbeddings(),
embedders=new_embedders,
embedder_name=list(new_embedders)[0],
url=TEST_MEILI_HTTP_ADDR, url=TEST_MEILI_HTTP_ADDR,
api_key=TEST_MEILI_MASTER_KEY, api_key=TEST_MEILI_MASTER_KEY,
index_name=INDEX_NAME, index_name=INDEX_NAME,
@ -144,4 +169,4 @@ class TestMeilisearchVectorSearch:
output = docsearch.similarity_search_by_vector_with_scores( output = docsearch.similarity_search_by_vector_with_scores(
embedding=embedded_query, k=1 embedding=embedded_query, k=1
) )
assert output == [(Document(page_content="foo", metadata={"page": "0"}), 9.0)] assert output == [(Document(page_content="foo", metadata={"page": "0"}), 1.0)]