mirror of https://github.com/hwchase17/langchain.git
synced 2025-09-08 22:42:05 +00:00
community[minor]: ManticoreSearch engine added to vectorstore (#19117)
**Description:** ManticoreSearch engine added to vectorstores
**Issue:** no issue, just a new feature
**Dependencies:** https://pypi.org/project/manticoresearch-dev/
**Twitter handle:** @EvilFreelancer

- Example notebook with test integration: https://github.com/EvilFreelancer/langchain/blob/manticore-search-vectorstore/docs/docs/integrations/vectorstores/manticore_search.ipynb

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
Co-authored-by: Chester Curme <chester.curme@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
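A minimal usage sketch of the new integration (it assumes a Manticore Search server on the default localhost:9308 and the manticoresearch-dev package installed; OpenAIEmbeddings and the table name below are only examples):

    from langchain_community.embeddings.openai import OpenAIEmbeddings
    from langchain_community.vectorstores import ManticoreSearch, ManticoreSearchSettings

    embeddings = OpenAIEmbeddings()
    store = ManticoreSearch.from_texts(
        texts=["Manticore Search speaks SQL and JSON", "LangChain wraps many vector stores"],
        embedding=embeddings,
        config=ManticoreSearchSettings(table="langchain_demo"),
    )
    print(store.similarity_search("SQL search engine", k=2))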
@@ -153,6 +153,10 @@ if TYPE_CHECKING:
    from langchain_community.vectorstores.llm_rails import (
        LLMRails,
    )
    from langchain_community.vectorstores.manticore_search import (
        ManticoreSearch,
        ManticoreSearchSettings,
    )
    from langchain_community.vectorstores.marqo import (
        Marqo,
    )
@@ -341,6 +345,8 @@ __all__ = [
    "LLMRails",
    "LanceDB",
    "Lantern",
    "ManticoreSearch",
    "ManticoreSearchSettings",
    "Marqo",
    "MatchingEngine",
    "Meilisearch",
@@ -439,6 +445,8 @@ _module_lookup = {
    "LLMRails": "langchain_community.vectorstores.llm_rails",
    "LanceDB": "langchain_community.vectorstores.lancedb",
    "Lantern": "langchain_community.vectorstores.lantern",
    "ManticoreSearch": "langchain_community.vectorstores.manticore_search",
    "ManticoreSearchSettings": "langchain_community.vectorstores.manticore_search",
    "Marqo": "langchain_community.vectorstores.marqo",
    "MatchingEngine": "langchain_community.vectorstores.matching_engine",
    "Meilisearch": "langchain_community.vectorstores.meilisearch",
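These hunks register ManticoreSearch and ManticoreSearchSettings in the package's TYPE_CHECKING imports, in __all__, and in the _module_lookup table that backs the package's lazy imports. Roughly, attribute access is resolved along these lines (a sketch of the usual pattern, not necessarily the exact __getattr__ in this module):

    import importlib
    from typing import Any

    def __getattr__(name: str) -> Any:
        # Lazily import e.g. ManticoreSearch only when the attribute is first accessed.
        if name in _module_lookup:
            module = importlib.import_module(_module_lookup[name])
            return getattr(module, name)
        raise AttributeError(f"module {__name__} has no attribute {name}")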
@@ -0,0 +1,372 @@
from __future__ import annotations

import json
import logging
import uuid
from hashlib import sha1
from typing import Any, Dict, Iterable, List, Optional, Type

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseSettings
from langchain_core.vectorstores import VectorStore

logger = logging.getLogger()
DEFAULT_K = 4  # Number of Documents to return.


class ManticoreSearchSettings(BaseSettings):
    proto: str = "http"
    host: str = "localhost"
    port: int = 9308

    username: Optional[str] = None
    password: Optional[str] = None

    # database: str = "Manticore"
    table: str = "langchain"

    column_map: Dict[str, str] = {
        "id": "id",
        "uuid": "uuid",
        "document": "document",
        "embedding": "embedding",
        "metadata": "metadata",
    }

    # A mandatory setting; currently, only hnsw is supported.
    knn_type: str = "hnsw"

    # A mandatory setting that specifies the dimensions of the vectors being indexed.
    knn_dims: Optional[int] = None  # Defaults autodetect

    # A mandatory setting that specifies the distance function used by the HNSW index.
    hnsw_similarity: str = "L2"  # Acceptable values are: L2, IP, COSINE

    # An optional setting that defines the maximum amount of outgoing connections
    # in the graph.
    hnsw_m: int = 16  # The default is 16.

    # An optional setting that defines a construction time/accuracy trade-off.
    hnsw_ef_construction = 100

    def get_connection_string(self) -> str:
        return self.proto + "://" + self.host + ":" + str(self.port)

    def __getitem__(self, item: str) -> Any:
        return getattr(self, item)

    class Config:
        env_file = ".env"
        env_prefix = "manticore_"
        env_file_encoding = "utf-8"

class ManticoreSearch(VectorStore):
    """
    `ManticoreSearch Engine` vector store.

    To use, you should have the ``manticoresearch`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import ManticoreSearch
            from langchain_community.embeddings.openai import OpenAIEmbeddings

            embeddings = OpenAIEmbeddings()
            vectorstore = ManticoreSearch(embeddings)
    """

    def __init__(
        self,
        embedding: Embeddings,
        *,
        config: Optional[ManticoreSearchSettings] = None,
        **kwargs: Any,
    ) -> None:
        """
        ManticoreSearch wrapper for LangChain.

        Args:
            embedding (Embeddings): Text embedding model.
            config (ManticoreSearchSettings): Configuration of the ManticoreSearch client.
            **kwargs: Other keyword arguments are passed into the Configuration of the
                manticoresearch-python API client. See
                https://github.com/manticoresoftware/manticoresearch-python for more.
        """
        try:
            import manticoresearch.api as ENDPOINTS
            import manticoresearch.api_client as API
        except ImportError:
            raise ImportError(
                "Could not import manticoresearch python package. "
                "Please install it with `pip install manticoresearch-dev`."
            )

        try:
            from tqdm import tqdm

            self.pgbar = tqdm
        except ImportError:
            # Fallback in case tqdm is not installed
            self.pgbar = lambda x, **kwargs: x

        super().__init__()

        self.embedding = embedding
        if config is not None:
            self.config = config
        else:
            self.config = ManticoreSearchSettings()

        assert self.config
        assert self.config.host and self.config.port
        assert (
            self.config.column_map
            # and self.config.database
            and self.config.table
        )

        assert (
            self.config.knn_type
            # and self.config.knn_dims
            # and self.config.hnsw_m
            # and self.config.hnsw_ef_construction
            and self.config.hnsw_similarity
        )

        for k in ["id", "embedding", "document", "metadata", "uuid"]:
            assert k in self.config.column_map

        # Detect embeddings dimension
        if self.config.knn_dims is None:
            self.dim: int = len(self.embedding.embed_query("test"))
        else:
            self.dim = self.config.knn_dims

        # Initialize the schema
        self.schema = f"""\
CREATE TABLE IF NOT EXISTS {self.config.table}(
    {self.config.column_map['id']} bigint,
    {self.config.column_map['document']} text indexed stored,
    {self.config.column_map['embedding']} \
float_vector knn_type='{self.config.knn_type}' \
knn_dims='{self.dim}' \
hnsw_similarity='{self.config.hnsw_similarity}' \
hnsw_m='{self.config.hnsw_m}' \
hnsw_ef_construction='{self.config.hnsw_ef_construction}',
    {self.config.column_map['metadata']} json,
    {self.config.column_map['uuid']} text indexed stored
)\
"""

        # Create a connection to ManticoreSearch
        self.configuration = API.Configuration(
            host=self.config.get_connection_string(),
            username=self.config.username,
            password=self.config.password,
            # disabled_client_side_validations=",",
            **kwargs,
        )
        self.connection = API.ApiClient(self.configuration)
        self.client = {
            "index": ENDPOINTS.IndexApi(self.connection),
            "utils": ENDPOINTS.UtilsApi(self.connection),
            "search": ENDPOINTS.SearchApi(self.connection),
        }

        # Create default schema if not exists
        self.client["utils"].sql(self.schema)

    @property
    def embeddings(self) -> Embeddings:
        return self.embedding

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        *,
        batch_size: int = 32,
        text_ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """
        Insert more texts through the embeddings and add to the VectorStore.

        Args:
            texts: Iterable of strings to add to the VectorStore
            metadatas: Optional column data to be inserted
            batch_size: Batch size of insertion
            text_ids: Optional list of ids to associate with the texts

        Returns:
            List of ids from adding the texts into the VectorStore.
        """
        # Embed and create the documents
        ids = text_ids or [
            # See https://stackoverflow.com/questions/67219691/python-hash-function-that-returns-32-or-64-bits
            str(int(sha1(t.encode("utf-8")).hexdigest()[:15], 16))
            for t in texts
        ]
        transac = []
        for i, text in enumerate(texts):
            embed = self.embeddings.embed_query(text)
            doc_uuid = str(uuid.uuid1())
            doc = {
                self.config.column_map["document"]: text,
                self.config.column_map["embedding"]: embed,
                self.config.column_map["metadata"]: metadatas[i] if metadatas else {},
                self.config.column_map["uuid"]: doc_uuid,
            }
            transac.append(
                {"replace": {"index": self.config.table, "id": ids[i], "doc": doc}}
            )

            if len(transac) == batch_size:
                body = "\n".join(map(json.dumps, transac))
                try:
                    self.client["index"].bulk(body)
                    transac = []
                except Exception as e:
                    logger.info(f"Error indexing documents: {e}")

        if len(transac) > 0:
            body = "\n".join(map(json.dumps, transac))
            try:
                self.client["index"].bulk(body)
            except Exception as e:
                logger.info(f"Error indexing documents: {e}")

        return ids

    @classmethod
    def from_texts(
        cls: Type[ManticoreSearch],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict[Any, Any]]] = None,
        *,
        config: Optional[ManticoreSearchSettings] = None,
        text_ids: Optional[List[str]] = None,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> ManticoreSearch:
        ctx = cls(embedding, config=config, **kwargs)
        ctx.add_texts(
            texts=texts,
            embedding=embedding,
            text_ids=text_ids,
            batch_size=batch_size,
            metadatas=metadatas,
            **kwargs,
        )
        return ctx

    @classmethod
    def from_documents(
        cls: Type[ManticoreSearch],
        documents: List[Document],
        embedding: Embeddings,
        *,
        config: Optional[ManticoreSearchSettings] = None,
        text_ids: Optional[List[str]] = None,
        batch_size: int = 32,
        **kwargs: Any,
    ) -> ManticoreSearch:
        texts = [doc.page_content for doc in documents]
        metadatas = [doc.metadata for doc in documents]
        return cls.from_texts(
            texts=texts,
            embedding=embedding,
            config=config,
            text_ids=text_ids,
            batch_size=batch_size,
            metadatas=metadatas,
            **kwargs,
        )

    def __repr__(self) -> str:
        """
        Text representation for ManticoreSearch Vector Store, prints backends, username
        and schemas. Easy to use with `str(ManticoreSearch())`.

        Returns:
            repr: string to show connection info and data schema
        """
        _repr = f"\033[92m\033[1m{self.config.table} @ "
        _repr += f"http://{self.config.host}:{self.config.port}\033[0m\n\n"
        _repr += f"\033[1musername: {self.config.username}\033[0m\n\nTable Schema:\n"
        _repr += "-" * 51 + "\n"
        for r in self.client["utils"].sql(f"DESCRIBE {self.config.table}")[0]["data"]:
            _repr += (
                f"|\033[94m{r['Field']:24s}\033[0m|\033["
                f"96m{r['Type'] + ' ' + r['Properties']:24s}\033[0m|\n"
            )
        _repr += "-" * 51 + "\n"
        return _repr

    def similarity_search(
        self, query: str, k: int = DEFAULT_K, **kwargs: Any
    ) -> List[Document]:
        """Perform a similarity search with ManticoreSearch

        Args:
            query (str): query string
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.

        Returns:
            List[Document]: List of Documents
        """
        return self.similarity_search_by_vector(
            self.embedding.embed_query(query), k, **kwargs
        )

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = DEFAULT_K,
        **kwargs: Any,
    ) -> List[Document]:
        """Perform a similarity search with ManticoreSearch by vectors

        Args:
            embedding (List[float]): Embedding vector
            k (int, optional): Top K neighbors to retrieve. Defaults to 4.

        Returns:
            List[Document]: List of documents
        """
        # Build search request
        request = {
            "index": self.config.table,
            "knn": {
                "field": self.config.column_map["embedding"],
                "k": k,
                "query_vector": embedding,
            },
        }

        # Execute request and convert response to langchain.Document format
        try:
            return [
                Document(
                    page_content=r["_source"][self.config.column_map["document"]],
                    metadata=r["_source"][self.config.column_map["metadata"]],
                )
                for r in self.client["search"].search(request, **kwargs).hits.hits[:k]
            ]
        except Exception as e:
            logger.error(f"\033[91m\033[1m{type(e)}\033[0m \033[95m{str(e)}\033[0m")
            return []

    def drop(self) -> None:
        """
        Helper function: Drop data
        """
        self.client["utils"].sql(f"DROP TABLE IF EXISTS {self.config.table}")

    @property
    def metadata_column(self) -> str:
        return self.config.column_map["metadata"]
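A hedged end-to-end sketch of the module above (the server address, table name, and embedding model are illustrative; because ManticoreSearchSettings is a BaseSettings with env_prefix "manticore_", the same values could instead come from manticore_* environment variables or a .env file):

    from langchain_core.documents import Document
    from langchain_community.embeddings.openai import OpenAIEmbeddings
    from langchain_community.vectorstores import ManticoreSearch, ManticoreSearchSettings

    # Illustrative settings; a Manticore Search server is assumed to be reachable here.
    settings = ManticoreSearchSettings(
        host="127.0.0.1",
        port=9308,
        table="docs_demo",
        hnsw_similarity="COSINE",  # L2, IP and COSINE are the accepted values
    )

    store = ManticoreSearch.from_documents(
        documents=[Document(page_content="hello manticore", metadata={"source": "demo"})],
        embedding=OpenAIEmbeddings(),
        config=settings,
    )
    print(store)                                  # __repr__ shows connection info and the table schema
    print(store.similarity_search("hello", k=1))  # returns a list of Documents
    store.drop()                                  # drops the backing table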
@@ -50,6 +50,8 @@ EXPECTED_ALL = [
    "LLMRails",
    "LanceDB",
    "Lantern",
    "ManticoreSearch",
    "ManticoreSearchSettings",
    "Marqo",
    "MatchingEngine",
    "Meilisearch",
@@ -112,6 +114,7 @@ def test_all_imports_exclusive() -> None:
            "PathwayVectorClient",
            "DistanceStrategy",
            "KineticaSettings",
            "ManticoreSearchSettings",
        ]:
            assert issubclass(getattr(vectorstores, cls), VectorStore)
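ManticoreSearchSettings appears in the test's exclusion list because, unlike ManticoreSearch itself, it is a pydantic settings object rather than a VectorStore subclass, so the subclass assertion is skipped for it. Illustratively:

    from langchain_core.vectorstores import VectorStore
    from langchain_community.vectorstores import ManticoreSearch, ManticoreSearchSettings

    assert issubclass(ManticoreSearch, VectorStore)
    assert not issubclass(ManticoreSearchSettings, VectorStore)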
@@ -1,106 +0,0 @@
"""Test the public API of the tools package."""
from langchain_community.vectorstores import __all__ as public_api

_EXPECTED = [
    "Aerospike",
    "AlibabaCloudOpenSearch",
    "AlibabaCloudOpenSearchSettings",
    "AnalyticDB",
    "Annoy",
    "ApacheDoris",
    "AtlasDB",
    "AwaDB",
    "AzureSearch",
    "Bagel",
    "BaiduVectorDB",
    "BESVectorStore",
    "BigQueryVectorSearch",
    "Cassandra",
    "AstraDB",
    "Chroma",
    "Clarifai",
    "Clickhouse",
    "ClickhouseSettings",
    "DashVector",
    "DatabricksVectorSearch",
    "DeepLake",
    "Dingo",
    "DistanceStrategy",
    "DocArrayHnswSearch",
    "DocArrayInMemorySearch",
    "DocumentDBVectorSearch",
    "DuckDB",
    "EcloudESVectorStore",
    "ElasticKnnSearch",
    "ElasticVectorSearch",
    "ElasticsearchStore",
    "Epsilla",
    "FAISS",
    "HanaDB",
    "Hologres",
    "InfinispanVS",
    "InMemoryVectorStore",
    "KDBAI",
    "Kinetica",
    "KineticaSettings",
    "LanceDB",
    "Lantern",
    "LLMRails",
    "Marqo",
    "MatchingEngine",
    "Meilisearch",
    "Milvus",
    "MomentoVectorIndex",
    "MongoDBAtlasVectorSearch",
    "MyScale",
    "MyScaleSettings",
    "Neo4jVector",
    "OpenSearchVectorSearch",
    "OracleVS",
    "PGEmbedding",
    "PGVector",
    "PathwayVectorClient",
    "Pinecone",
    "Qdrant",
    "Redis",
    "Relyt",
    "Rockset",
    "SKLearnVectorStore",
    "ScaNN",
    "SemaDB",
    "SingleStoreDB",
    "SQLiteVSS",
    "StarRocks",
    "SupabaseVectorStore",
    "SurrealDBStore",
    "Tair",
    "TiDBVectorStore",
    "TileDB",
    "Tigris",
    "TimescaleVector",
    "Typesense",
    "UpstashVectorStore",
    "USearch",
    "Vald",
    "VDMS",
    "Vearch",
    "Vectara",
    "VespaStore",
    "VLite",
    "Weaviate",
    "ZepVectorStore",
    "Zilliz",
    "TencentVectorDB",
    "AzureCosmosDBVectorSearch",
    "VectorStore",
    "Yellowbrick",
    "NeuralDBClientVectorStore",
    "NeuralDBVectorStore",
    "CouchbaseVectorStore",
]


def test_public_api() -> None:
    """Test for regressions or changes in the public API."""
    # Check that the public API is as expected
    assert set(public_api) == set(_EXPECTED)