core[minor],community[patch],standard-tests[patch]: Move InMemoryImplementation to langchain-core (#23986)

This PR moves the in memory implementation to langchain-core.

* The implementation remains importable from langchain-community.
* Supporting utilities are marked as private for now.
This commit is contained in:
Eugene Yurtsev
2024-07-08 17:11:51 -04:00
committed by GitHub
parent aa8c9bb4a9
commit f765e8fa9d
9 changed files with 515 additions and 259 deletions

View File

@@ -1,249 +1,5 @@
import json
import uuid
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
from langchain_core.vectorstores import InMemoryVectorStore
import numpy as np
from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.indexing import UpsertResponse
from langchain_core.load import dumpd, load
from langchain_core.vectorstores import VectorStore
from langchain_community.utils.math import cosine_similarity
from langchain_community.vectorstores.utils import maximal_marginal_relevance
class InMemoryVectorStore(VectorStore):
"""In-memory implementation of VectorStore using a dictionary.
Uses numpy to compute cosine similarity for search.
Args:
embedding: embedding function to use.
"""
def __init__(self, embedding: Embeddings) -> None:
self.store: Dict[str, Dict[str, Any]] = {}
self.embedding = embedding
@property
def embeddings(self) -> Embeddings:
return self.embedding
def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
if ids:
for _id in ids:
self.store.pop(_id, None)
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
self.delete(ids)
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
vectors = self.embedding.embed_documents([item.page_content for item in items])
ids = []
for item, vector in zip(items, vectors):
doc_id = item.id if item.id else str(uuid.uuid4())
ids.append(doc_id)
self.store[doc_id] = {
"id": doc_id,
"vector": vector,
"text": item.page_content,
"metadata": item.metadata,
}
return {
"succeeded": ids,
"failed": [],
}
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
"""Get documents by their ids."""
documents = []
for doc_id in ids:
doc = self.store.get(doc_id)
if doc:
documents.append(
Document(
id=doc["id"],
page_content=doc["text"],
metadata=doc["metadata"],
)
)
return documents
async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
return self.get_by_ids(ids)
async def aadd_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> List[str]:
return self.add_texts(texts, metadatas, **kwargs)
def _similarity_search_with_score_by_vector(
self,
embedding: List[float],
k: int = 4,
filter: Optional[Callable[[Document], bool]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float, List[float]]]:
result = []
for doc in self.store.values():
vector = doc["vector"]
similarity = float(cosine_similarity([embedding], [vector]).item(0))
result.append(
(
Document(
id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
),
similarity,
vector,
)
)
result.sort(key=lambda x: x[1], reverse=True)
if filter is not None:
result = [r for r in result if filter(r[0])]
return result[:k]
def similarity_search_with_score_by_vector(
self,
embedding: List[float],
k: int = 4,
filter: Optional[Callable[[Document], bool]] = None,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
return [
(doc, similarity)
for doc, similarity, _ in self._similarity_search_with_score_by_vector(
embedding=embedding, k=k, filter=filter, **kwargs
)
]
def similarity_search_with_score(
self,
query: str,
k: int = 4,
**kwargs: Any,
) -> List[Tuple[Document, float]]:
embedding = self.embedding.embed_query(query)
docs = self.similarity_search_with_score_by_vector(
embedding,
k,
**kwargs,
)
return docs
async def asimilarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
return self.similarity_search_with_score(query, k, **kwargs)
def similarity_search_by_vector(
self,
embedding: List[float],
k: int = 4,
**kwargs: Any,
) -> List[Document]:
docs_and_scores = self.similarity_search_with_score_by_vector(
embedding,
k,
**kwargs,
)
return [doc for doc, _ in docs_and_scores]
async def asimilarity_search_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
return self.similarity_search_by_vector(embedding, k, **kwargs)
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]
async def asimilarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
return self.similarity_search(query, k, **kwargs)
def max_marginal_relevance_search_by_vector(
self,
embedding: List[float],
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
**kwargs: Any,
) -> List[Document]:
prefetch_hits = self._similarity_search_with_score_by_vector(
embedding=embedding,
k=fetch_k,
**kwargs,
)
mmr_chosen_indices = maximal_marginal_relevance(
np.array(embedding, dtype=np.float32),
[vector for _, _, vector in prefetch_hits],
k=k,
lambda_mult=lambda_mult,
)
return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]
def max_marginal_relevance_search(
self,
query: str,
k: int = 4,
fetch_k: int = 20,
lambda_mult: float = 0.5,
**kwargs: Any,
) -> List[Document]:
embedding_vector = self.embedding.embed_query(query)
return self.max_marginal_relevance_search_by_vector(
embedding_vector,
k,
fetch_k,
lambda_mult=lambda_mult,
**kwargs,
)
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> "InMemoryVectorStore":
store = cls(
embedding=embedding,
)
store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
return store
@classmethod
async def afrom_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> "InMemoryVectorStore":
return cls.from_texts(texts, embedding, metadatas, **kwargs)
@classmethod
def load(
cls, path: str, embedding: Embeddings, **kwargs: Any
) -> "InMemoryVectorStore":
_path: Path = Path(path)
with _path.open("r") as f:
store = load(json.load(f))
vectorstore = cls(embedding=embedding, **kwargs)
vectorstore.store = store
return vectorstore
def dump(self, path: str) -> None:
_path: Path = Path(path)
_path.parent.mkdir(exist_ok=True, parents=True)
with _path.open("w") as f:
json.dump(dumpd(self.store), f, indent=2)
__all__ = [
"InMemoryVectorStore",
]