community[minor]: ManticoreSearch engine added to vectorstore (#19117)

**Description:** Adds the ManticoreSearch engine to the community vectorstores
**Issue:** N/A (new feature, no associated issue)
**Dependencies:** https://pypi.org/project/manticoresearch-dev/
**Twitter handle:** @EvilFreelancer

- Example notebook demonstrating the integration:

https://github.com/EvilFreelancer/langchain/blob/manticore-search-vectorstore/docs/docs/integrations/vectorstores/manticore_search.ipynb
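
For reviewers, a minimal usage sketch distilled from the notebook (it assumes a Manticore Search instance reachable on the default `localhost:9308` and an OpenAI key for the embeddings; both are illustrative choices, not requirements of the integration):

```python
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import ManticoreSearch

# Embed a few sample texts and index them in the default "langchain" table.
vectorstore = ManticoreSearch.from_texts(
    texts=["hello world", "goodbye world", "hello manticore"],
    embedding=OpenAIEmbeddings(),
)

# Retrieve the k nearest documents for a query string.
docs = vectorstore.similarity_search("greetings", k=2)
print([d.page_content for d in docs])
```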

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Chester Curme <chester.curme@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
Authored by Pavel Zloi on 2024-05-23 23:56:18 +03:00, committed by GitHub
parent 95c3e5f85f · commit fe26f937e4
5 changed files with 826 additions and 106 deletions


@@ -153,6 +153,10 @@ if TYPE_CHECKING:
from langchain_community.vectorstores.llm_rails import (
LLMRails,
)
from langchain_community.vectorstores.manticore_search import (
ManticoreSearch,
ManticoreSearchSettings,
)
from langchain_community.vectorstores.marqo import (
Marqo,
)
@@ -341,6 +345,8 @@ __all__ = [
"LLMRails",
"LanceDB",
"Lantern",
"ManticoreSearch",
"ManticoreSearchSettings",
"Marqo",
"MatchingEngine",
"Meilisearch",
@@ -439,6 +445,8 @@ _module_lookup = {
"LLMRails": "langchain_community.vectorstores.llm_rails",
"LanceDB": "langchain_community.vectorstores.lancedb",
"Lantern": "langchain_community.vectorstores.lantern",
"ManticoreSearch": "langchain_community.vectorstores.manticore_search",
"ManticoreSearchSettings": "langchain_community.vectorstores.manticore_search",
"Marqo": "langchain_community.vectorstores.marqo",
"MatchingEngine": "langchain_community.vectorstores.matching_engine",
"Meilisearch": "langchain_community.vectorstores.meilisearch",


@@ -0,0 +1,372 @@
from __future__ import annotations
import json
import logging
import uuid
from hashlib import sha1
from typing import Any, Dict, Iterable, List, Optional, Type
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseSettings
from langchain_core.vectorstores import VectorStore
logger = logging.getLogger(__name__)
DEFAULT_K = 4 # Number of Documents to return.
class ManticoreSearchSettings(BaseSettings):
proto: str = "http"
host: str = "localhost"
port: int = 9308
username: Optional[str] = None
password: Optional[str] = None
# database: str = "Manticore"
table: str = "langchain"
column_map: Dict[str, str] = {
"id": "id",
"uuid": "uuid",
"document": "document",
"embedding": "embedding",
"metadata": "metadata",
}
# A mandatory setting; currently, only hnsw is supported.
knn_type: str = "hnsw"
# A mandatory setting that specifies the dimensions of the vectors being indexed.
knn_dims: Optional[int] = None # Defaults autodetect
# A mandatory setting that specifies the distance function used by the HNSW index.
hnsw_similarity: str = "L2" # Acceptable values are: L2, IP, COSINE
# An optional setting that defines the maximum amount of outgoing connections
# in the graph.
hnsw_m: int = 16 # The default is 16.
# An optional setting that defines a construction time/accuracy trade-off.
hnsw_ef_construction: int = 100  # The default is 100.
def get_connection_string(self) -> str:
return self.proto + "://" + self.host + ":" + str(self.port)
def __getitem__(self, item: str) -> Any:
return getattr(self, item)
class Config:
env_file = ".env"
env_prefix = "manticore_"
env_file_encoding = "utf-8"
class ManticoreSearch(VectorStore):
"""
`ManticoreSearch Engine` vector store.
To use, you should have the ``manticoresearch-dev`` python package installed.
Example:
.. code-block:: python
from langchain_community.vectorstores import ManticoreSearch
from langchain_community.embeddings.openai import OpenAIEmbeddings
embeddings = OpenAIEmbeddings()
vectorstore = ManticoreSearch(embeddings)
"""
def __init__(
self,
embedding: Embeddings,
*,
config: Optional[ManticoreSearchSettings] = None,
**kwargs: Any,
) -> None:
"""
ManticoreSearch wrapper for LangChain
Args:
embedding (Embeddings): Text embedding model.
config (ManticoreSearchSettings): Configuration of ManticoreSearch Client
**kwargs: Other keyword arguments are passed to the Configuration of the
manticoresearch-python API client. See
https://github.com/manticoresoftware/manticoresearch-python for more.
"""
try:
import manticoresearch.api as ENDPOINTS
import manticoresearch.api_client as API
except ImportError:
raise ImportError(
"Could not import manticoresearch python package. "
"Please install it with `pip install manticoresearch-dev`."
)
try:
from tqdm import tqdm
self.pgbar = tqdm
except ImportError:
# Fall back to a pass-through if tqdm is not installed
self.pgbar = lambda x, **kwargs: x
super().__init__()
self.embedding = embedding
if config is not None:
self.config = config
else:
self.config = ManticoreSearchSettings()
assert self.config
assert self.config.host and self.config.port
assert (
self.config.column_map
# and self.config.database
and self.config.table
)
assert (
self.config.knn_type
# and self.config.knn_dims
# and self.config.hnsw_m
# and self.config.hnsw_ef_construction
and self.config.hnsw_similarity
)
for k in ["id", "embedding", "document", "metadata", "uuid"]:
assert k in self.config.column_map
# Detect embeddings dimension
if self.config.knn_dims is None:
self.dim: int = len(self.embedding.embed_query("test"))
else:
self.dim = self.config.knn_dims
# Initialize the schema
self.schema = f"""\
CREATE TABLE IF NOT EXISTS {self.config.table}(
{self.config.column_map['id']} bigint,
{self.config.column_map['document']} text indexed stored,
{self.config.column_map['embedding']} \
float_vector knn_type='{self.config.knn_type}' \
knn_dims='{self.dim}' \
hnsw_similarity='{self.config.hnsw_similarity}' \
hnsw_m='{self.config.hnsw_m}' \
hnsw_ef_construction='{self.config.hnsw_ef_construction}',
{self.config.column_map['metadata']} json,
{self.config.column_map['uuid']} text indexed stored
)\
"""
# Create a connection to ManticoreSearch
self.configuration = API.Configuration(
host=self.config.get_connection_string(),
username=self.config.username,
password=self.config.password,
# disabled_client_side_validations=",",
**kwargs,
)
self.connection = API.ApiClient(self.configuration)
self.client = {
"index": ENDPOINTS.IndexApi(self.connection),
"utils": ENDPOINTS.UtilsApi(self.connection),
"search": ENDPOINTS.SearchApi(self.connection),
}
# Create default schema if not exists
self.client["utils"].sql(self.schema)
@property
def embeddings(self) -> Embeddings:
return self.embedding
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
*,
batch_size: int = 32,
text_ids: Optional[List[str]] = None,
**kwargs: Any,
) -> List[str]:
"""
Embed the given texts and add them to the VectorStore.
Args:
texts: Iterable of strings to add to the VectorStore
metadatas: Optional list of metadata dicts to store alongside the texts
batch_size: Batch size of insertion
text_ids: Optional list of ids to associate with the texts
Returns:
List of ids from adding the texts into the VectorStore.
"""
# Materialize the iterable so it can be traversed more than once below
texts = list(texts)
# Embed and create the documents
ids = text_ids or [
# See https://stackoverflow.com/questions/67219691/python-hash-function-that-returns-32-or-64-bits
str(int(sha1(t.encode("utf-8")).hexdigest()[:15], 16))
for t in texts
]
transac = []
for i, text in enumerate(texts):
embed = self.embeddings.embed_query(text)
doc_uuid = str(uuid.uuid1())
doc = {
self.config.column_map["document"]: text,
self.config.column_map["embedding"]: embed,
self.config.column_map["metadata"]: metadatas[i] if metadatas else {},
self.config.column_map["uuid"]: doc_uuid,
}
transac.append(
{"replace": {"index": self.config.table, "id": ids[i], "doc": doc}}
)
if len(transac) == batch_size:
body = "\n".join(map(json.dumps, transac))
try:
self.client["index"].bulk(body)
transac = []
except Exception as e:
logger.info(f"Error indexing documents: {e}")
if len(transac) > 0:
body = "\n".join(map(json.dumps, transac))
try:
self.client["index"].bulk(body)
except Exception as e:
logger.info(f"Error indexing documents: {e}")
return ids
@classmethod
def from_texts(
cls: Type[ManticoreSearch],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[Dict[Any, Any]]] = None,
*,
config: Optional[ManticoreSearchSettings] = None,
text_ids: Optional[List[str]] = None,
batch_size: int = 32,
**kwargs: Any,
) -> ManticoreSearch:
ctx = cls(embedding, config=config, **kwargs)
ctx.add_texts(
texts=texts,
embedding=embedding,
text_ids=text_ids,
batch_size=batch_size,
metadatas=metadatas,
**kwargs,
)
return ctx
@classmethod
def from_documents(
cls: Type[ManticoreSearch],
documents: List[Document],
embedding: Embeddings,
*,
config: Optional[ManticoreSearchSettings] = None,
text_ids: Optional[List[str]] = None,
batch_size: int = 32,
**kwargs: Any,
) -> ManticoreSearch:
texts = [doc.page_content for doc in documents]
metadatas = [doc.metadata for doc in documents]
return cls.from_texts(
texts=texts,
embedding=embedding,
text_ids=text_ids,
batch_size=batch_size,
metadatas=metadatas,
**kwargs,
)
def __repr__(self) -> str:
"""
Text representation of the ManticoreSearch vector store, showing the backend,
username and table schema. Obtainable via `str(vectorstore)`.
Returns:
repr: string to show connection info and data schema
"""
_repr = f"\033[92m\033[1m{self.config.table} @ "
_repr += f"http://{self.config.host}:{self.config.port}\033[0m\n\n"
_repr += f"\033[1musername: {self.config.username}\033[0m\n\nTable Schema:\n"
_repr += "-" * 51 + "\n"
for r in self.client["utils"].sql(f"DESCRIBE {self.config.table}")[0]["data"]:
_repr += (
f"|\033[94m{r['Field']:24s}\033[0m|\033["
f"96m{r['Type'] + ' ' + r['Properties']:24s}\033[0m|\n"
)
_repr += "-" * 51 + "\n"
return _repr
def similarity_search(
self, query: str, k: int = DEFAULT_K, **kwargs: Any
) -> List[Document]:
"""Perform a similarity search with ManticoreSearch
Args:
query (str): query string
k (int, optional): Top K neighbors to retrieve. Defaults to 4.
Returns:
List[Document]: List of Documents
"""
return self.similarity_search_by_vector(
self.embedding.embed_query(query), k, **kwargs
)
def similarity_search_by_vector(
self,
embedding: List[float],
k: int = DEFAULT_K,
**kwargs: Any,
) -> List[Document]:
"""Perform a similarity search with ManticoreSearch by vectors
Args:
embedding (List[float]): Embedding vector
k (int, optional): Top K neighbors to retrieve. Defaults to 4.
Returns:
List[Document]: List of documents
"""
# Build search request
request = {
"index": self.config.table,
"knn": {
"field": self.config.column_map["embedding"],
"k": k,
"query_vector": embedding,
},
}
# Execute request and convert response to langchain.Document format
try:
return [
Document(
page_content=r["_source"][self.config.column_map["document"]],
metadata=r["_source"][self.config.column_map["metadata"]],
)
for r in self.client["search"].search(request, **kwargs).hits.hits[:k]
]
except Exception as e:
logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
return []
def drop(self) -> None:
"""
Helper function: Drop data
"""
self.client["utils"].sql(f"DROP TABLE IF EXISTS {self.config.table}")
@property
def metadata_column(self) -> str:
return self.config.column_map["metadata"]


@@ -50,6 +50,8 @@ EXPECTED_ALL = [
"LLMRails",
"LanceDB",
"Lantern",
"ManticoreSearch",
"ManticoreSearchSettings",
"Marqo",
"MatchingEngine",
"Meilisearch",
@@ -112,6 +114,7 @@ def test_all_imports_exclusive() -> None:
"PathwayVectorClient",
"DistanceStrategy",
"KineticaSettings",
"ManticoreSearchSettings",
]:
assert issubclass(getattr(vectorstores, cls), VectorStore)


@@ -1,106 +0,0 @@
"""Test the public API of the tools package."""
from langchain_community.vectorstores import __all__ as public_api
_EXPECTED = [
"Aerospike",
"AlibabaCloudOpenSearch",
"AlibabaCloudOpenSearchSettings",
"AnalyticDB",
"Annoy",
"ApacheDoris",
"AtlasDB",
"AwaDB",
"AzureSearch",
"Bagel",
"BaiduVectorDB",
"BESVectorStore",
"BigQueryVectorSearch",
"Cassandra",
"AstraDB",
"Chroma",
"Clarifai",
"Clickhouse",
"ClickhouseSettings",
"DashVector",
"DatabricksVectorSearch",
"DeepLake",
"Dingo",
"DistanceStrategy",
"DocArrayHnswSearch",
"DocArrayInMemorySearch",
"DocumentDBVectorSearch",
"DuckDB",
"EcloudESVectorStore",
"ElasticKnnSearch",
"ElasticVectorSearch",
"ElasticsearchStore",
"Epsilla",
"FAISS",
"HanaDB",
"Hologres",
"InfinispanVS",
"InMemoryVectorStore",
"KDBAI",
"Kinetica",
"KineticaSettings",
"LanceDB",
"Lantern",
"LLMRails",
"Marqo",
"MatchingEngine",
"Meilisearch",
"Milvus",
"MomentoVectorIndex",
"MongoDBAtlasVectorSearch",
"MyScale",
"MyScaleSettings",
"Neo4jVector",
"OpenSearchVectorSearch",
"OracleVS",
"PGEmbedding",
"PGVector",
"PathwayVectorClient",
"Pinecone",
"Qdrant",
"Redis",
"Relyt",
"Rockset",
"SKLearnVectorStore",
"ScaNN",
"SemaDB",
"SingleStoreDB",
"SQLiteVSS",
"StarRocks",
"SupabaseVectorStore",
"SurrealDBStore",
"Tair",
"TiDBVectorStore",
"TileDB",
"Tigris",
"TimescaleVector",
"Typesense",
"UpstashVectorStore",
"USearch",
"Vald",
"VDMS",
"Vearch",
"Vectara",
"VespaStore",
"VLite",
"Weaviate",
"ZepVectorStore",
"Zilliz",
"TencentVectorDB",
"AzureCosmosDBVectorSearch",
"VectorStore",
"Yellowbrick",
"NeuralDBClientVectorStore",
"NeuralDBVectorStore",
"CouchbaseVectorStore",
]
def test_public_api() -> None:
"""Test for regressions or changes in the public API."""
# Check that the public API is as expected
assert set(public_api) == set(_EXPECTED)