Feature: Add support for Meilisearch vector store (#7649)

**Description:**

Add support for Meilisearch vector store.
Resolve #7603 

- No external dependencies added
- A notebook has been added

@rlancemartin

https://twitter.com/meilisearch

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Amélie
2023-07-29 02:06:54 +02:00
committed by GitHub
parent b7d6e1909c
commit 8ee56b9a5b
5 changed files with 780 additions and 0 deletions

View File

@@ -24,6 +24,7 @@ from langchain.vectorstores.hologres import Hologres
from langchain.vectorstores.lancedb import LanceDB
from langchain.vectorstores.marqo import Marqo
from langchain.vectorstores.matching_engine import MatchingEngine
from langchain.vectorstores.meilisearch import Meilisearch
from langchain.vectorstores.milvus import Milvus
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
from langchain.vectorstores.myscale import MyScale, MyScaleSettings
@@ -68,6 +69,7 @@ __all__ = [
"LanceDB",
"MatchingEngine",
"Marqo",
"Meilisearch",
"Milvus",
"Zilliz",
"SingleStoreDB",

View File

@@ -0,0 +1,312 @@
"""Wrapper around Meilisearch vector database."""
from __future__ import annotations
import uuid
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.utils import get_from_env
from langchain.vectorstores.base import VectorStore
if TYPE_CHECKING:
from meilisearch import Client
def _create_client(
    client: Optional[Client] = None,
    url: Optional[str] = None,
    api_key: Optional[str] = None,
) -> Client:
    """Return a usable Meilisearch client, building one when none is given.

    Args:
        client (Optional[Client]): An existing ``meilisearch.Client``. When
            falsy, a new client is created from ``url`` and ``api_key``.
        url (Optional[str]): Address of the Meilisearch instance. Falls back
            to the ``MEILI_HTTP_ADDR`` environment variable.
        api_key (Optional[str]): API key for the instance. Falls back to the
            ``MEILI_MASTER_KEY`` environment variable; it stays unset when
            neither is available.

    Returns:
        Client: The client that was passed in, or the newly created one.

    Raises:
        ValueError: If the ``meilisearch`` package cannot be imported, if
            ``client`` is not a ``meilisearch.Client``, if no URL can be
            resolved, or if ``client.version()`` raises ``ValueError``.
    """
    try:
        import meilisearch
    except ImportError as e:
        raise ValueError(
            "Could not import meilisearch python package. "
            "Please install it with `pip install meilisearch`."
        ) from e

    if not client:
        url = url or get_from_env("url", "MEILI_HTTP_ADDR")
        try:
            api_key = api_key or get_from_env("api_key", "MEILI_MASTER_KEY")
        except ValueError:
            # The API key is optional: a missing MEILI_MASTER_KEY is not an
            # error, so only the "variable not found" ValueError is ignored.
            pass
        client = meilisearch.Client(url=url, api_key=api_key)
    elif not isinstance(client, meilisearch.Client):
        raise ValueError(
            f"client should be an instance of meilisearch.Client, "
            f"got {type(client)}"
        )

    # Issue one request up front so an unreachable instance is noticed here.
    # NOTE(review): only ValueError is wrapped; any other exception raised by
    # client.version() propagates unchanged.
    try:
        client.version()
    except ValueError as e:
        raise ValueError(f"Failed to connect to Meilisearch: {e}") from e
    return client
class Meilisearch(VectorStore):
    """Wrapper around the Meilisearch vector database.

    To use this, you need to have the `meilisearch` python package installed,
    and a running Meilisearch instance.

    To learn more about Meilisearch Python, refer to the in-depth
    Meilisearch Python documentation: https://meilisearch.github.io/meilisearch-python/.

    See the following documentation for how to run a Meilisearch instance:
    https://www.meilisearch.com/docs/learn/getting_started/quick_start.

    Example:
        .. code-block:: python

            from langchain.vectorstores import Meilisearch
            from langchain.embeddings.openai import OpenAIEmbeddings
            import meilisearch

            # api_key is optional; provide it if your meilisearch instance requires it
            client = meilisearch.Client(url='http://127.0.0.1:7700', api_key='***')
            embeddings = OpenAIEmbeddings()
            vectorstore = Meilisearch(
                embedding=embeddings,
                client=client,
                index_name='langchain_demo',
                text_key='text')
    """

    def __init__(
        self,
        embedding: Embeddings,
        client: Optional[Client] = None,
        url: Optional[str] = None,
        api_key: Optional[str] = None,
        index_name: str = "langchain-demo",
        text_key: str = "text",
        metadata_key: str = "metadata",
    ):
        """Initialize with Meilisearch client.

        Args:
            embedding (Embeddings): Embedding model used for texts and queries.
            client (Optional[Client]): Existing Meilisearch client. When not
                given, one is created from ``url`` and ``api_key``.
            url (Optional[str]): Address of the Meilisearch instance.
            api_key (Optional[str]): API key for the Meilisearch instance.
            index_name (str): Index that documents are written to and
                searched in. Defaults to "langchain-demo".
            text_key (str): Key, inside the metadata object, under which the
                document text is stored. Defaults to "text".
            metadata_key (str): Document field that holds the metadata
                object. Defaults to "metadata".
        """
        client = _create_client(client=client, url=url, api_key=api_key)
        self._client = client
        self._index_name = index_name
        self._embedding = embedding
        self._text_key = text_key
        self._metadata_key = metadata_key

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embedding and add them to the vector store.

        Args:
            texts (Iterable[str]): Iterable of strings/text to add to the vectorstore.
            metadatas (Optional[List[dict]]): Optional list of metadata, one
                entry per text. Defaults to None.
            ids (Optional[List[str]]): Optional list of IDs, one per text.
                Random hex UUIDs are generated when omitted. Defaults to None.
            **kwargs (Any): Accepted for interface compatibility; unused.

        Returns:
            List[str]: List of IDs of the texts added to the vectorstore.
        """
        # Materialize once: the iterable is consumed several times below.
        texts = list(texts)
        if ids is None:
            ids = [uuid.uuid4().hex for _ in texts]
        if metadatas is None:
            metadatas = [{} for _ in texts]

        embedding_vectors = self._embedding.embed_documents(texts)

        docs = []
        for i, text in enumerate(texts):
            # Work on a copy so the caller's metadata dict does not gain the
            # text key as a side effect of indexing.
            metadata = dict(metadatas[i])
            metadata[self._text_key] = text
            docs.append(
                {
                    "id": ids[i],
                    "_vectors": embedding_vectors[i],
                    self._metadata_key: metadata,
                }
            )

        # Send to Meilisearch
        self._client.index(str(self._index_name)).add_documents(docs)
        return ids

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return meilisearch documents most similar to the query.

        Args:
            query (str): Query text for which to find similar documents.
            k (int): Number of documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata.
                Defaults to None.
            **kwargs (Any): Forwarded to ``similarity_search_with_score``.

        Returns:
            List[Document]: List of Documents most similar to the query text.
        """
        docs_and_scores = self.similarity_search_with_score(
            query=query,
            k=k,
            filter=filter,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        filter: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return meilisearch documents most similar to the query, along with scores.

        Args:
            query (str): Query text for which to find similar documents.
            k (int): Number of documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata.
                Defaults to None.
            **kwargs (Any): Forwarded to
                ``similarity_search_by_vector_with_scores``.

        Returns:
            List[Tuple[Document, float]]: Documents most similar to the query
                text, each paired with its score.
        """
        _query = self._embedding.embed_query(query)
        return self.similarity_search_by_vector_with_scores(
            embedding=_query,
            k=k,
            filter=filter,
            **kwargs,
        )

    def similarity_search_by_vector_with_scores(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return meilisearch documents most similar to embedding vector.

        Args:
            embedding (List[float]): Embedding to look up similar documents.
            k (int): Number of documents to return. Defaults to 4.
            filter (Optional[Dict[str, Any]]): Filter by metadata; passed to
                the search request as given. Defaults to None.
            **kwargs (Any): Accepted for interface compatibility; unused.

        Returns:
            List[Tuple[Document, float]]: Documents most similar to the query
                vector, each paired with the ``_semanticScore`` of its hit.
        """
        results = self._client.index(str(self._index_name)).search(
            "", {"vector": embedding, "limit": k, "filter": filter}
        )

        docs = []
        for result in results["hits"]:
            metadata = result[self._metadata_key]
            if self._text_key not in metadata:
                # Without the text key there is no page content to build a
                # Document from, so the hit is skipped.
                continue
            # The text lives inside the metadata object; pull it out so it
            # is not duplicated in the returned Document's metadata.
            text = metadata.pop(self._text_key)
            semantic_score = result["_semanticScore"]
            docs.append(
                (Document(page_content=text, metadata=metadata), semantic_score)
            )
        return docs

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Dict[str, str]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return meilisearch documents most similar to embedding vector.

        Args:
            embedding (List[float]): Embedding to look up similar documents.
            k (int): Number of documents to return. Defaults to 4.
            filter (Optional[Dict[str, str]]): Filter by metadata.
                Defaults to None.
            **kwargs (Any): Forwarded to
                ``similarity_search_by_vector_with_scores``.

        Returns:
            List[Document]: List of Documents most similar to the query vector.
        """
        docs = self.similarity_search_by_vector_with_scores(
            embedding=embedding,
            k=k,
            filter=filter,
            **kwargs,
        )
        return [doc for doc, _ in docs]

    @classmethod
    def from_texts(
        cls: Type[Meilisearch],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        client: Optional[Client] = None,
        url: Optional[str] = None,
        api_key: Optional[str] = None,
        index_name: str = "langchain-demo",
        ids: Optional[List[str]] = None,
        text_key: Optional[str] = "text",
        metadata_key: Optional[str] = "metadata",
        **kwargs: Any,
    ) -> Meilisearch:
        """Construct Meilisearch wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Adds the documents to a provided Meilisearch index.

        This is intended to be a quick way to get started.

        Args:
            texts (List[str]): Texts to embed and index.
            embedding (Embeddings): Embedding model used for texts and queries.
            metadatas (Optional[List[dict]]): Optional list of metadata, one
                entry per text. Defaults to None.
            client (Optional[Client]): Existing Meilisearch client. When not
                given, one is created from ``url`` and ``api_key``.
            url (Optional[str]): Address of the Meilisearch instance.
            api_key (Optional[str]): API key for the Meilisearch instance.
            index_name (str): Index to write to. Defaults to "langchain-demo".
            ids (Optional[List[str]]): Optional list of IDs, one per text.
            text_key (Optional[str]): Key under which the text is stored in
                the metadata object. None falls back to "text".
            metadata_key (Optional[str]): Document field holding the metadata
                object. None falls back to "metadata".
            **kwargs (Any): Accepted for interface compatibility; unused.

        Returns:
            Meilisearch: The vector store, with ``texts`` submitted to it.

        Example:
            .. code-block:: python

                from langchain.vectorstores import Meilisearch
                from langchain.embeddings import OpenAIEmbeddings
                import meilisearch

                # api_key is optional; provide it if your meilisearch instance requires it
                client = meilisearch.Client(url='http://127.0.0.1:7700', api_key='***')
                embeddings = OpenAIEmbeddings()
                docsearch = Meilisearch.from_texts(
                    texts=['foo', 'bar', 'baz'],
                    embedding=embeddings,
                    client=client,
                )
        """
        # The constructor resolves and validates the client itself, so the
        # connection arguments are handed over as-is instead of validating
        # the client twice. The keys are given to the constructor because
        # that is where they take effect; add_texts reads them from the
        # instance.
        vectorstore = cls(
            embedding=embedding,
            client=client,
            url=url,
            api_key=api_key,
            index_name=index_name,
            text_key=text_key or "text",
            metadata_key=metadata_key or "metadata",
        )
        vectorstore.add_texts(
            texts=texts,
            metadatas=metadatas,
            ids=ids,
        )
        return vectorstore

View File

@@ -0,0 +1,17 @@
version: "3.8"
services:
  meilisearch:
    image: getmeili/meilisearch:latest
    environment:
      - MEILI_MASTER_KEY=${MEILI_MASTER_KEY:-masterKey}
      - MEILI_NO_ANALYTICS=${MEILI_NO_ANALYTICS:-true}
      - MEILI_ENV=${MEILI_ENV:-development}
    ports:
      # Quoted so the host:container mapping is always read as a string.
      - "${MEILI_PORT:-7700}:7700"
    restart: unless-stopped
    healthcheck:
      # Probe the dedicated health endpoint rather than the root path.
      test: ["CMD", "curl", "-f", "http://localhost:7700/health"]
      interval: 10s
      timeout: 5s
      retries: 5

View File

@@ -0,0 +1,143 @@
"""Test Meilisearch functionality."""
from typing import Generator
import meilisearch
import pytest
import requests
from langchain.docstore.document import Document
from langchain.vectorstores import Meilisearch
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
# Name of the index the tests pass as `index_name` when building the store.
INDEX_NAME = "test-langchain-demo"
# Address of the Meilisearch instance the tests connect to.
TEST_MEILI_HTTP_ADDR = "http://localhost:7700"
# Key sent to that instance, both as `api_key` and as a Bearer token.
TEST_MEILI_MASTER_KEY = "masterKey"
class TestMeilisearchVectorSearch:
    """Integration tests for the Meilisearch vector store.

    They talk to the instance at ``TEST_MEILI_HTTP_ADDR`` and delete every
    index on it, so they must only be pointed at a disposable instance.
    """

    @pytest.fixture(scope="class", autouse=True)
    def enable_vector_search(self) -> Generator[str, None, None]:
        """Turn the vector-store feature on for the class, then off again."""
        response = requests.patch(
            f"{TEST_MEILI_HTTP_ADDR}/experimental-features",
            headers={"Authorization": f"Bearer {TEST_MEILI_MASTER_KEY}"},
            json={"vectorStore": True},
            timeout=10,
        )
        # Fail here, with the HTTP error, rather than in every test below
        # if the feature could not be enabled.
        response.raise_for_status()
        yield "done"
        requests.patch(
            f"{TEST_MEILI_HTTP_ADDR}/experimental-features",
            headers={"Authorization": f"Bearer {TEST_MEILI_MASTER_KEY}"},
            json={"vectorStore": False},
            timeout=10,
        )

    @pytest.fixture(autouse=True)
    def setup(self) -> None:
        """Start every test from an instance with no indexes."""
        self.delete_all_indexes()

    @pytest.fixture(scope="class", autouse=True)
    def teardown_test(self) -> Generator[str, None, None]:
        """Remove all indexes once the tests of the class have run."""
        # Yields back to the test function.
        yield "done"
        self.delete_all_indexes()

    def delete_all_indexes(self) -> None:
        """Delete every index in the Meilisearch instance, one at a time."""
        client = self.client()
        indexes = client.get_indexes()
        for index in indexes["results"]:
            task = client.index(index.uid).delete()
            # Block until the deletion task has been processed.
            client.wait_for_task(task.task_uid)

    def client(self) -> meilisearch.Client:
        """Return a new client for the test instance."""
        return meilisearch.Client(TEST_MEILI_HTTP_ADDR, TEST_MEILI_MASTER_KEY)

    def _wait_last_task(self) -> None:
        """Block until the first task returned by ``get_tasks`` is done."""
        client = self.client()
        # Get the last task
        tasks = client.get_tasks()
        # Wait for the last task to be completed
        client.wait_for_task(tasks.results[0].uid)

    def test_meilisearch(self) -> None:
        """Test end to end construction and search."""
        texts = ["foo", "bar", "baz"]
        vectorstore = Meilisearch.from_texts(
            texts=texts,
            embedding=FakeEmbeddings(),
            url=TEST_MEILI_HTTP_ADDR,
            api_key=TEST_MEILI_MASTER_KEY,
            index_name=INDEX_NAME,
        )
        self._wait_last_task()
        output = vectorstore.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

    def test_meilisearch_with_client(self) -> None:
        """Test end to end construction and search with a provided client."""
        texts = ["foo", "bar", "baz"]
        vectorstore = Meilisearch.from_texts(
            texts=texts,
            embedding=FakeEmbeddings(),
            client=self.client(),
            index_name=INDEX_NAME,
        )
        self._wait_last_task()
        output = vectorstore.similarity_search("foo", k=1)
        assert output == [Document(page_content="foo")]

    def test_meilisearch_with_metadatas(self) -> None:
        """Test end to end construction and search with metadata."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": i} for i in range(len(texts))]
        docsearch = Meilisearch.from_texts(
            texts=texts,
            embedding=FakeEmbeddings(),
            url=TEST_MEILI_HTTP_ADDR,
            api_key=TEST_MEILI_MASTER_KEY,
            index_name=INDEX_NAME,
            metadatas=metadatas,
        )
        self._wait_last_task()
        output = docsearch.similarity_search("foo", k=1)
        assert len(output) == 1
        assert output[0].page_content == "foo"
        assert output[0].metadata["page"] == 0
        assert output == [Document(page_content="foo", metadata={"page": 0})]

    def test_meilisearch_with_metadatas_with_scores(self) -> None:
        """Test end to end construction and scored search."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": str(i)} for i in range(len(texts))]
        docsearch = Meilisearch.from_texts(
            texts=texts,
            embedding=FakeEmbeddings(),
            url=TEST_MEILI_HTTP_ADDR,
            api_key=TEST_MEILI_MASTER_KEY,
            index_name=INDEX_NAME,
            metadatas=metadatas,
        )
        self._wait_last_task()
        output = docsearch.similarity_search_with_score("foo", k=1)
        assert output == [(Document(page_content="foo", metadata={"page": "0"}), 9.0)]

    def test_meilisearch_with_metadatas_with_scores_using_vector(self) -> None:
        """Test end to end construction and scored search, using embedding vector."""
        texts = ["foo", "bar", "baz"]
        metadatas = [{"page": str(i)} for i in range(len(texts))]
        # One embedding object serves both indexing and the query vector.
        embeddings = FakeEmbeddings()
        docsearch = Meilisearch.from_texts(
            texts=texts,
            embedding=embeddings,
            url=TEST_MEILI_HTTP_ADDR,
            api_key=TEST_MEILI_MASTER_KEY,
            index_name=INDEX_NAME,
            metadatas=metadatas,
        )
        embedded_query = embeddings.embed_query("foo")
        self._wait_last_task()
        output = docsearch.similarity_search_by_vector_with_scores(
            embedding=embedded_query, k=1
        )
        assert output == [(Document(page_content="foo", metadata={"page": "0"}), 9.0)]