mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-25 17:08:46 +00:00
- **Description:** The InMemoryVectorStore is a nice and simple vector store implementation for quick development and debugging. The current implementation is quite limited in functionality. This PR extends it by adding utility functions to persist the vector store to a JSON file and to load it back from a JSON file. We chose the JSON format because it allows the database contents to be inspected in a text editor, which is great for debugging. Furthermore, it adds a `filter` keyword that can be used to filter documents by their `page_content` or `metadata`. - **Issue:** - - **Dependencies:** - - **Twitter handle:** @Vincent_Min
233 lines
7.1 KiB
Python
233 lines
7.1 KiB
Python
import json
|
|
import uuid
|
|
from pathlib import Path
|
|
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
|
|
|
|
import numpy as np
|
|
from langchain_core.documents import Document
|
|
from langchain_core.embeddings import Embeddings
|
|
from langchain_core.load import dumpd, load
|
|
from langchain_core.vectorstores import VectorStore
|
|
|
|
from langchain_community.utils.math import cosine_similarity
|
|
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
|
|
|
|
|
class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Uses numpy to compute cosine similarity for search.

    Every entry of ``self.store`` maps a document id to a dictionary with the
    keys ``"id"``, ``"vector"``, ``"text"`` and ``"metadata"``.

    Args:
        embedding: embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
        """Create an empty store that embeds texts with ``embedding``."""
        # document id -> {"id", "vector", "text", "metadata"}
        self.store: Dict[str, Dict[str, Any]] = {}
        self.embedding = embedding

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding function this store was created with."""
        return self.embedding

    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Remove the entries with the given ids.

        Ids that are not present are ignored; when ``ids`` is ``None`` or
        empty nothing is removed.

        Args:
            ids: ids of the entries to remove.
            **kwargs: accepted for interface compatibility; unused.
        """
        if ids:
            for _id in ids:
                self.store.pop(_id, None)

    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Async variant of :meth:`delete`; runs the synchronous code directly."""
        self.delete(ids)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[Sequence[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed ``texts`` and add them to the store.

        Args:
            texts: texts to embed and store.
            metadatas: optional metadata dictionaries, indexed positionally
                alongside ``texts``. An empty dict is stored when omitted.
            ids: optional ids, indexed positionally alongside ``texts``.
                A random UUID4 string is generated per text when omitted.
                An existing entry with the same id is overwritten.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            The ids under which the texts were stored, in input order.
        """
        # Materialize once: ``texts`` may be a one-shot iterator, and it is
        # needed both for embedding and for building the stored entries.
        text_list = list(texts)
        vectors = self.embedding.embed_documents(text_list)
        ids_: List[str] = []

        for i, text in enumerate(text_list):
            doc_id = ids[i] if ids else str(uuid.uuid4())
            ids_.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vectors[i],
                "text": text,
                "metadata": metadatas[i] if metadatas else {},
            }
        return ids_

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Async variant of :meth:`add_texts`; runs the synchronous code directly.

        ``ids`` may be supplied through ``**kwargs``.
        """
        return self.add_texts(texts, metadatas, **kwargs)

    def _similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float, List[float]]]:
        """Return the ``k`` stored documents most similar to ``embedding``.

        Args:
            embedding: query vector.
            k: maximum number of hits to return.
            filter: optional predicate; only documents for which it returns a
                truthy value are considered.
            **kwargs: accepted for interface compatibility; unused.

        Returns:
            ``(document, cosine similarity, stored vector)`` tuples sorted by
            descending similarity.
        """
        result = []
        for entry in self.store.values():
            document = Document(
                page_content=entry["text"], metadata=entry["metadata"]
            )
            # Filter before scoring so rejected documents cost no similarity
            # computation; the surviving hits and their order are unchanged.
            if filter is not None and not filter(document):
                continue
            vector = entry["vector"]
            similarity = float(cosine_similarity([embedding], [vector]).item(0))
            result.append((document, similarity, vector))
        result.sort(key=lambda hit: hit[1], reverse=True)
        return result[:k]

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return up to ``k`` ``(document, cosine similarity)`` pairs.

        Args:
            embedding: query vector.
            k: maximum number of hits to return.
            filter: optional predicate restricting the candidate documents.
            **kwargs: forwarded to the internal search helper.

        Returns:
            Pairs sorted by descending similarity.
        """
        return [
            (doc, similarity)
            for doc, similarity, _ in self._similarity_search_with_score_by_vector(
                embedding=embedding, k=k, filter=filter, **kwargs
            )
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return up to ``k`` ``(document, score)`` pairs.

        Extra keyword arguments (for example ``filter``) are forwarded to
        :meth:`similarity_search_with_score_by_vector`.
        """
        embedding = self.embedding.embed_query(query)
        docs = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return docs

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Async variant of :meth:`similarity_search_with_score` (runs synchronously)."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to ``embedding``.

        Extra keyword arguments (for example ``filter``) are forwarded to
        :meth:`similarity_search_with_score_by_vector`.
        """
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of :meth:`similarity_search_by_vector` (runs synchronously)."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Embed ``query`` and return up to ``k`` most similar documents."""
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of :meth:`similarity_search` (runs synchronously)."""
        return self.similarity_search(query, k, **kwargs)

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Select documents for ``embedding`` with maximal marginal relevance.

        Args:
            embedding: query vector.
            k: number of documents requested from the MMR selection.
            fetch_k: number of candidates pre-fetched by cosine similarity
                before the MMR selection.
            lambda_mult: forwarded to ``maximal_marginal_relevance``.
            **kwargs: forwarded to the pre-fetch search (for example
                ``filter``).

        Returns:
            The pre-fetched documents at the indices chosen by
            ``maximal_marginal_relevance``, in that order.
        """
        prefetch_hits = self._similarity_search_with_score_by_vector(
            embedding=embedding,
            k=fetch_k,
            **kwargs,
        )

        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [vector for _, _, vector in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and run :meth:`max_marginal_relevance_search_by_vector`."""
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Create a store and populate it with ``texts``.

        Args:
            texts: texts to embed and store.
            embedding: embedding function to use.
            metadatas: optional metadata dictionaries, one per text.
            **kwargs: forwarded to :meth:`add_texts` (for example ``ids``).

        Returns:
            The populated store.
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Async variant of :meth:`from_texts`; runs the synchronous code directly."""
        return cls.from_texts(texts, embedding, metadatas, **kwargs)

    @classmethod
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> "InMemoryVectorStore":
        """Load a store previously written by :meth:`dump`.

        Args:
            path: path of the JSON file to read.
            embedding: embedding function the restored store should use.
            **kwargs: forwarded to the class constructor.

        Returns:
            A new store whose contents are the data read from ``path``.
        """
        _path: Path = Path(path)
        # Explicit encoding so reading does not depend on the platform locale.
        with _path.open("r", encoding="utf-8") as f:
            # ``load`` here is the module-level function imported from
            # ``langchain_core.load``, not this classmethod: names defined in
            # a class body are not visible from inside its methods.
            # NOTE(review): the file content is revived by that loader, so
            # only files from trusted sources should be loaded — confirm.
            store = load(json.load(f))
        vectorstore = cls(embedding=embedding, **kwargs)
        vectorstore.store = store
        return vectorstore

    def dump(self, path: str) -> None:
        """Write the store contents to ``path`` as indented JSON.

        Missing parent directories are created.

        Args:
            path: path of the JSON file to write.
        """
        _path: Path = Path(path)
        _path.parent.mkdir(exist_ok=True, parents=True)
        # Explicit encoding so writing does not depend on the platform locale.
        with _path.open("w", encoding="utf-8") as f:
            json.dump(dumpd(self.store), f, indent=2)