mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-05 11:12:47 +00:00
core[minor],community[patch],standard-tests[patch]: Move InMemoryImplementation to langchain-core (#23986)
This PR moves the in memory implementation to langchain-core. * The implementation remains importable from langchain-community. * Supporting utilities are marked as private for now.
This commit is contained in:
parent
aa8c9bb4a9
commit
f765e8fa9d
@ -1,249 +1,5 @@
|
|||||||
import json
|
from langchain_core.vectorstores import InMemoryVectorStore
|
||||||
import uuid
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
|
|
||||||
|
|
||||||
import numpy as np
|
__all__ = [
|
||||||
from langchain_core.documents import Document
|
"InMemoryVectorStore",
|
||||||
from langchain_core.embeddings import Embeddings
|
]
|
||||||
from langchain_core.indexing import UpsertResponse
|
|
||||||
from langchain_core.load import dumpd, load
|
|
||||||
from langchain_core.vectorstores import VectorStore
|
|
||||||
|
|
||||||
from langchain_community.utils.math import cosine_similarity
|
|
||||||
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
|
||||||
|
|
||||||
|
|
||||||
class InMemoryVectorStore(VectorStore):
|
|
||||||
"""In-memory implementation of VectorStore using a dictionary.
|
|
||||||
Uses numpy to compute cosine similarity for search.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
embedding: embedding function to use.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, embedding: Embeddings) -> None:
|
|
||||||
self.store: Dict[str, Dict[str, Any]] = {}
|
|
||||||
self.embedding = embedding
|
|
||||||
|
|
||||||
@property
|
|
||||||
def embeddings(self) -> Embeddings:
|
|
||||||
return self.embedding
|
|
||||||
|
|
||||||
def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
|
|
||||||
if ids:
|
|
||||||
for _id in ids:
|
|
||||||
self.store.pop(_id, None)
|
|
||||||
|
|
||||||
async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
|
|
||||||
self.delete(ids)
|
|
||||||
|
|
||||||
def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
|
|
||||||
vectors = self.embedding.embed_documents([item.page_content for item in items])
|
|
||||||
ids = []
|
|
||||||
for item, vector in zip(items, vectors):
|
|
||||||
doc_id = item.id if item.id else str(uuid.uuid4())
|
|
||||||
ids.append(doc_id)
|
|
||||||
self.store[doc_id] = {
|
|
||||||
"id": doc_id,
|
|
||||||
"vector": vector,
|
|
||||||
"text": item.page_content,
|
|
||||||
"metadata": item.metadata,
|
|
||||||
}
|
|
||||||
return {
|
|
||||||
"succeeded": ids,
|
|
||||||
"failed": [],
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
|
||||||
"""Get documents by their ids."""
|
|
||||||
documents = []
|
|
||||||
|
|
||||||
for doc_id in ids:
|
|
||||||
doc = self.store.get(doc_id)
|
|
||||||
if doc:
|
|
||||||
documents.append(
|
|
||||||
Document(
|
|
||||||
id=doc["id"],
|
|
||||||
page_content=doc["text"],
|
|
||||||
metadata=doc["metadata"],
|
|
||||||
)
|
|
||||||
)
|
|
||||||
return documents
|
|
||||||
|
|
||||||
async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
|
|
||||||
return self.get_by_ids(ids)
|
|
||||||
|
|
||||||
async def aadd_texts(
|
|
||||||
self,
|
|
||||||
texts: Iterable[str],
|
|
||||||
metadatas: Optional[List[dict]] = None,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> List[str]:
|
|
||||||
return self.add_texts(texts, metadatas, **kwargs)
|
|
||||||
|
|
||||||
def _similarity_search_with_score_by_vector(
|
|
||||||
self,
|
|
||||||
embedding: List[float],
|
|
||||||
k: int = 4,
|
|
||||||
filter: Optional[Callable[[Document], bool]] = None,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> List[Tuple[Document, float, List[float]]]:
|
|
||||||
result = []
|
|
||||||
for doc in self.store.values():
|
|
||||||
vector = doc["vector"]
|
|
||||||
similarity = float(cosine_similarity([embedding], [vector]).item(0))
|
|
||||||
result.append(
|
|
||||||
(
|
|
||||||
Document(
|
|
||||||
id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
|
|
||||||
),
|
|
||||||
similarity,
|
|
||||||
vector,
|
|
||||||
)
|
|
||||||
)
|
|
||||||
result.sort(key=lambda x: x[1], reverse=True)
|
|
||||||
if filter is not None:
|
|
||||||
result = [r for r in result if filter(r[0])]
|
|
||||||
return result[:k]
|
|
||||||
|
|
||||||
def similarity_search_with_score_by_vector(
|
|
||||||
self,
|
|
||||||
embedding: List[float],
|
|
||||||
k: int = 4,
|
|
||||||
filter: Optional[Callable[[Document], bool]] = None,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> List[Tuple[Document, float]]:
|
|
||||||
return [
|
|
||||||
(doc, similarity)
|
|
||||||
for doc, similarity, _ in self._similarity_search_with_score_by_vector(
|
|
||||||
embedding=embedding, k=k, filter=filter, **kwargs
|
|
||||||
)
|
|
||||||
]
|
|
||||||
|
|
||||||
def similarity_search_with_score(
|
|
||||||
self,
|
|
||||||
query: str,
|
|
||||||
k: int = 4,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> List[Tuple[Document, float]]:
|
|
||||||
embedding = self.embedding.embed_query(query)
|
|
||||||
docs = self.similarity_search_with_score_by_vector(
|
|
||||||
embedding,
|
|
||||||
k,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
return docs
|
|
||||||
|
|
||||||
async def asimilarity_search_with_score(
|
|
||||||
self, query: str, k: int = 4, **kwargs: Any
|
|
||||||
) -> List[Tuple[Document, float]]:
|
|
||||||
return self.similarity_search_with_score(query, k, **kwargs)
|
|
||||||
|
|
||||||
def similarity_search_by_vector(
|
|
||||||
self,
|
|
||||||
embedding: List[float],
|
|
||||||
k: int = 4,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> List[Document]:
|
|
||||||
docs_and_scores = self.similarity_search_with_score_by_vector(
|
|
||||||
embedding,
|
|
||||||
k,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
return [doc for doc, _ in docs_and_scores]
|
|
||||||
|
|
||||||
async def asimilarity_search_by_vector(
|
|
||||||
self, embedding: List[float], k: int = 4, **kwargs: Any
|
|
||||||
) -> List[Document]:
|
|
||||||
return self.similarity_search_by_vector(embedding, k, **kwargs)
|
|
||||||
|
|
||||||
def similarity_search(
|
|
||||||
self, query: str, k: int = 4, **kwargs: Any
|
|
||||||
) -> List[Document]:
|
|
||||||
return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]
|
|
||||||
|
|
||||||
async def asimilarity_search(
|
|
||||||
self, query: str, k: int = 4, **kwargs: Any
|
|
||||||
) -> List[Document]:
|
|
||||||
return self.similarity_search(query, k, **kwargs)
|
|
||||||
|
|
||||||
def max_marginal_relevance_search_by_vector(
|
|
||||||
self,
|
|
||||||
embedding: List[float],
|
|
||||||
k: int = 4,
|
|
||||||
fetch_k: int = 20,
|
|
||||||
lambda_mult: float = 0.5,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> List[Document]:
|
|
||||||
prefetch_hits = self._similarity_search_with_score_by_vector(
|
|
||||||
embedding=embedding,
|
|
||||||
k=fetch_k,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
mmr_chosen_indices = maximal_marginal_relevance(
|
|
||||||
np.array(embedding, dtype=np.float32),
|
|
||||||
[vector for _, _, vector in prefetch_hits],
|
|
||||||
k=k,
|
|
||||||
lambda_mult=lambda_mult,
|
|
||||||
)
|
|
||||||
return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]
|
|
||||||
|
|
||||||
def max_marginal_relevance_search(
|
|
||||||
self,
|
|
||||||
query: str,
|
|
||||||
k: int = 4,
|
|
||||||
fetch_k: int = 20,
|
|
||||||
lambda_mult: float = 0.5,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> List[Document]:
|
|
||||||
embedding_vector = self.embedding.embed_query(query)
|
|
||||||
return self.max_marginal_relevance_search_by_vector(
|
|
||||||
embedding_vector,
|
|
||||||
k,
|
|
||||||
fetch_k,
|
|
||||||
lambda_mult=lambda_mult,
|
|
||||||
**kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_texts(
|
|
||||||
cls,
|
|
||||||
texts: List[str],
|
|
||||||
embedding: Embeddings,
|
|
||||||
metadatas: Optional[List[dict]] = None,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> "InMemoryVectorStore":
|
|
||||||
store = cls(
|
|
||||||
embedding=embedding,
|
|
||||||
)
|
|
||||||
store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
|
|
||||||
return store
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
async def afrom_texts(
|
|
||||||
cls,
|
|
||||||
texts: List[str],
|
|
||||||
embedding: Embeddings,
|
|
||||||
metadatas: Optional[List[dict]] = None,
|
|
||||||
**kwargs: Any,
|
|
||||||
) -> "InMemoryVectorStore":
|
|
||||||
return cls.from_texts(texts, embedding, metadatas, **kwargs)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def load(
|
|
||||||
cls, path: str, embedding: Embeddings, **kwargs: Any
|
|
||||||
) -> "InMemoryVectorStore":
|
|
||||||
_path: Path = Path(path)
|
|
||||||
with _path.open("r") as f:
|
|
||||||
store = load(json.load(f))
|
|
||||||
vectorstore = cls(embedding=embedding, **kwargs)
|
|
||||||
vectorstore.store = store
|
|
||||||
return vectorstore
|
|
||||||
|
|
||||||
def dump(self, path: str) -> None:
|
|
||||||
_path: Path = Path(path)
|
|
||||||
_path.parent.mkdir(exist_ok=True, parents=True)
|
|
||||||
with _path.open("w") as f:
|
|
||||||
json.dump(dumpd(self.store), f, indent=2)
|
|
||||||
|
9
libs/core/langchain_core/vectorstores/__init__.py
Normal file
9
libs/core/langchain_core/vectorstores/__init__.py
Normal file
@ -0,0 +1,9 @@
|
|||||||
|
from langchain_core.vectorstores.base import VST, VectorStore, VectorStoreRetriever
|
||||||
|
from langchain_core.vectorstores.in_memory import InMemoryVectorStore
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"VectorStore",
|
||||||
|
"VST",
|
||||||
|
"VectorStoreRetriever",
|
||||||
|
"InMemoryVectorStore",
|
||||||
|
]
|
275
libs/core/langchain_core/vectorstores/in_memory.py
Normal file
275
libs/core/langchain_core/vectorstores/in_memory.py
Normal file
@ -0,0 +1,275 @@
|
|||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import (
|
||||||
|
TYPE_CHECKING,
|
||||||
|
Any,
|
||||||
|
Callable,
|
||||||
|
Dict,
|
||||||
|
Iterable,
|
||||||
|
List,
|
||||||
|
Optional,
|
||||||
|
Sequence,
|
||||||
|
Tuple,
|
||||||
|
)
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.embeddings import Embeddings
|
||||||
|
from langchain_core.load import dumpd, load
|
||||||
|
from langchain_core.vectorstores import VectorStore
|
||||||
|
from langchain_core.vectorstores.utils import _cosine_similarity as cosine_similarity
|
||||||
|
from langchain_core.vectorstores.utils import (
|
||||||
|
_maximal_marginal_relevance as maximal_marginal_relevance,
|
||||||
|
)
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from langchain_core.indexing import UpsertResponse
|
||||||
|
|
||||||
|
|
||||||
|
class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Uses numpy to compute cosine similarity for search.

    Each entry of ``self.store`` maps a document id to a dict with the keys
    ``"id"``, ``"vector"``, ``"text"`` and ``"metadata"``.

    Args:
        embedding: embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
        """Initialize with the given embedding function."""
        # TODO: would be nice to change to
        # Dict[str, Document] at some point (will be a breaking change)
        self.store: Dict[str, Dict[str, Any]] = {}
        self.embedding = embedding

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding function this store was constructed with."""
        return self.embedding

    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Remove the entries with the given ids.

        Ids that are not present are ignored. Passing ``None`` or an empty
        sequence leaves the store untouched.
        """
        if ids:
            for _id in ids:
                self.store.pop(_id, None)

    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Async counterpart of ``delete``; calls the sync method directly."""
        # Forward kwargs so the sync and async entry points see the same call.
        self.delete(ids, **kwargs)

    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
        """Embed the documents and insert or overwrite them in the store.

        A document keeps its own ``id`` when it has a truthy one; otherwise a
        fresh ``uuid4`` string is generated. An existing entry with the same
        id is replaced.

        Returns:
            A dict with ``"succeeded"`` (the ids written, in input order) and
            ``"failed"`` (always an empty list here).
        """
        vectors = self.embedding.embed_documents([item.page_content for item in items])
        ids = []
        for item, vector in zip(items, vectors):
            doc_id = item.id if item.id else str(uuid.uuid4())
            ids.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vector,
                "text": item.page_content,
                "metadata": item.metadata,
            }
        return {
            "succeeded": ids,
            "failed": [],
        }

    def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Get documents by their ids.

        Ids that are not in the store are skipped, so the result may be
        shorter than ``ids``.
        """
        documents = []

        for doc_id in ids:
            doc = self.store.get(doc_id)
            if doc:
                documents.append(
                    Document(
                        id=doc["id"],
                        page_content=doc["text"],
                        metadata=doc["metadata"],
                    )
                )
        return documents

    async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Async counterpart of ``get_by_ids``; calls the sync method directly."""
        return self.get_by_ids(ids)

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Async counterpart of ``add_texts``; calls the sync method directly."""
        # ``add_texts`` is not defined in this class body; presumably it is
        # inherited from VectorStore -- confirm against the base class.
        return self.add_texts(texts, metadatas, **kwargs)

    def _similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float, List[float]]]:
        """Score every stored document against ``embedding``.

        Args:
            embedding: query vector.
            k: maximum number of hits to return.
            filter: optional predicate; only documents for which it returns a
                truthy value are kept.

        Returns:
            Up to ``k`` ``(document, cosine similarity, stored vector)``
            tuples, highest similarity first.
        """
        result = []
        for doc in self.store.values():
            vector = doc["vector"]
            similarity = float(cosine_similarity([embedding], [vector]).item(0))
            result.append(
                (
                    Document(
                        id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
                    ),
                    similarity,
                    vector,
                )
            )
        result.sort(key=lambda x: x[1], reverse=True)
        # Filtering happens before truncation so that ``k`` counts matching
        # documents only.
        if filter is not None:
            result = [r for r in result if filter(r[0])]
        return result[:k]

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return up to ``k`` ``(document, similarity)`` pairs for a vector."""
        return [
            (doc, similarity)
            for doc, similarity, _ in self._similarity_search_with_score_by_vector(
                embedding=embedding, k=k, filter=filter, **kwargs
            )
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return up to ``k`` ``(document, similarity)`` pairs.

        Extra keyword arguments (for example ``filter``) are forwarded to
        ``similarity_search_with_score_by_vector``.
        """
        embedding = self.embedding.embed_query(query)
        docs = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return docs

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Async counterpart of ``similarity_search_with_score``."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to ``embedding``."""
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async counterpart of ``similarity_search_by_vector``."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Embed ``query`` and return up to ``k`` most similar documents."""
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async counterpart of ``similarity_search``."""
        return self.similarity_search(query, k, **kwargs)

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Select documents by maximal marginal relevance for a query vector.

        The ``fetch_k`` most similar documents are fetched first, then up to
        ``k`` of them are chosen by ``maximal_marginal_relevance``.

        Args:
            embedding: query vector.
            k: maximum number of documents to return.
            fetch_k: number of candidates passed to the MMR selection.
            lambda_mult: forwarded unchanged to ``maximal_marginal_relevance``.

        Raises:
            ImportError: if numpy is not installed.
        """
        prefetch_hits = self._similarity_search_with_score_by_vector(
            embedding=embedding,
            k=fetch_k,
            **kwargs,
        )

        try:
            import numpy as np
        except ImportError as e:
            # The original message ran its two sentences together; keep the
            # install hint but make it readable, and preserve the cause.
            raise ImportError(
                "numpy must be installed to use max_marginal_relevance_search. "
                "Please install it with `pip install numpy`."
            ) from e

        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [vector for _, _, vector in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and run ``max_marginal_relevance_search_by_vector``."""
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Create a store and populate it via ``add_texts``.

        ``kwargs`` are forwarded to ``add_texts``, not to the constructor.
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Async counterpart of ``from_texts``; calls the sync method directly."""
        return cls.from_texts(texts, embedding, metadatas, **kwargs)

    @classmethod
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> "InMemoryVectorStore":
        """Rebuild a store from a JSON file written by ``dump``.

        Args:
            path: location of the JSON file.
            embedding: embedding function for the new store; it is not read
                from the file.
            **kwargs: forwarded to the class constructor.
        """
        _path: Path = Path(path)
        with _path.open("r", encoding="utf-8") as f:
            # Inside this method ``load`` is the module-level
            # ``langchain_core.load.load``, not this classmethod.
            # NOTE(review): that loader may revive serialized objects, so the
            # file should presumably come from a trusted source -- confirm.
            store = load(json.load(f))
        vectorstore = cls(embedding=embedding, **kwargs)
        vectorstore.store = store
        return vectorstore

    def dump(self, path: str) -> None:
        """Write ``self.store`` to ``path`` as JSON, creating parent directories."""
        _path: Path = Path(path)
        _path.parent.mkdir(exist_ok=True, parents=True)
        with _path.open("w", encoding="utf-8") as f:
            json.dump(dumpd(self.store), f, indent=2)
|
100
libs/core/langchain_core/vectorstores/utils.py
Normal file
100
libs/core/langchain_core/vectorstores/utils.py
Normal file
@ -0,0 +1,100 @@
|
|||||||
|
"""Internal utilities for the in memory implementation of VectorStore.
|
||||||
|
|
||||||
|
These are part of a private API and users should not use them directly
|
||||||
|
as they can change without notice.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import TYPE_CHECKING, List, Union
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Row-wise cosine similarity between two equal-width matrices.

    Args:
        X: matrix whose rows are the first set of vectors.
        Y: matrix whose rows are the second set of vectors; must have the
            same number of columns as ``X``.

    Returns:
        On the numpy path, an array of shape ``(len(X), len(Y))`` whose entry
        ``[i, j]`` is the cosine similarity of ``X[i]`` and ``Y[j]``, with
        NaN/inf entries (e.g. from zero-norm rows) replaced by ``0.0``.
        If either input is empty, an empty 1-D array is returned instead.

    Raises:
        ImportError: if numpy is not installed.
        ValueError: if ``X`` and ``Y`` have a different number of columns.
    """
    try:
        import numpy as np
    except ImportError as e:
        raise ImportError(
            "cosine_similarity requires numpy to be installed. "
            "Please install numpy with `pip install numpy`."
        ) from e

    if len(X) == 0 or len(Y) == 0:
        return np.array([])

    X = np.array(X)
    Y = np.array(Y)
    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
            f"and Y has shape {Y.shape}."
        )
    # Only the optional import is guarded, so an ImportError raised while the
    # accelerated path is running is not mistaken for "simsimd is missing".
    try:
        import simsimd as simd  # type: ignore
    except ImportError:
        logger.debug(
            "Unable to import simsimd, defaulting to NumPy implementation. If you want "
            "to use simsimd please install with `pip install simsimd`."
        )
        X_norm = np.linalg.norm(X, axis=1)
        Y_norm = np.linalg.norm(Y, axis=1)
        # Ignore divide by zero errors run time warnings as those are handled below.
        with np.errstate(divide="ignore", invalid="ignore"):
            similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
        similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
        return similarity
    else:
        # simsimd returns cosine *distance*; convert it to similarity.
        X = np.array(X, dtype=np.float32)
        Y = np.array(Y, dtype=np.float32)
        Z = 1 - np.array(simd.cdist(X, Y, metric="cosine"))
        return Z
|
||||||
|
|
||||||
|
|
||||||
|
def _maximal_marginal_relevance(
    query_embedding: np.ndarray,
    embedding_list: list,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[int]:
    """Calculate maximal marginal relevance.

    Starts with the candidate most similar to the query, then repeatedly adds
    the not-yet-chosen candidate that maximizes
    ``lambda_mult * sim_to_query - (1 - lambda_mult) * max_sim_to_chosen``.

    Args:
        query_embedding: query vector; a 1-D array is expanded to one row.
        embedding_list: candidate vectors.
        lambda_mult: weight of query similarity versus redundancy with the
            candidates already chosen.
        k: maximum number of indices to return.

    Returns:
        Indices into ``embedding_list`` in selection order; at most
        ``min(k, len(embedding_list))`` of them, and an empty list when that
        bound is not positive.

    Raises:
        ImportError: if numpy is not installed.
    """
    try:
        import numpy as np
    except ImportError as e:
        raise ImportError(
            "maximal_marginal_relevance requires numpy to be installed. "
            "Please install numpy with `pip install numpy`."
        ) from e

    # Loop-invariant upper bound on how many indices can be returned.
    n_select = min(k, len(embedding_list))
    if n_select <= 0:
        return []
    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)
    similarity_to_query = _cosine_similarity(query_embedding, embedding_list)[0]
    most_similar = int(np.argmax(similarity_to_query))
    idxs = [most_similar]
    selected = np.array([embedding_list[most_similar]])
    while len(idxs) < n_select:
        best_score = -np.inf
        idx_to_add = -1
        similarity_to_selected = _cosine_similarity(embedding_list, selected)
        for i, query_score in enumerate(similarity_to_query):
            if i in idxs:
                continue
            redundant_score = max(similarity_to_selected[i])
            equation_score = (
                lambda_mult * query_score - (1 - lambda_mult) * redundant_score
            )
            if equation_score > best_score:
                best_score = equation_score
                idx_to_add = i
        if idx_to_add == -1:
            # No remaining candidate compared greater than -inf (e.g. every
            # score was NaN). Stop instead of appending the -1 sentinel,
            # which would silently alias the last embedding.
            break
        idxs.append(idx_to_add)
        selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
    return idxs
|
23
libs/core/poetry.lock
generated
23
libs/core/poetry.lock
generated
@ -1,4 +1,4 @@
|
|||||||
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "annotated-types"
|
name = "annotated-types"
|
||||||
@ -1197,6 +1197,24 @@ files = [
|
|||||||
{file = "jupyterlab_widgets-3.0.11.tar.gz", hash = "sha256:dd5ac679593c969af29c9bed054c24f26842baa51352114736756bc035deee27"},
|
{file = "jupyterlab_widgets-3.0.11.tar.gz", hash = "sha256:dd5ac679593c969af29c9bed054c24f26842baa51352114736756bc035deee27"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "langchain-standard-tests"
|
||||||
|
version = "0.1.1"
|
||||||
|
description = "Standard tests for LangChain implementations"
|
||||||
|
optional = false
|
||||||
|
python-versions = ">=3.8.1,<4.0"
|
||||||
|
files = []
|
||||||
|
develop = true
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
httpx = "^0.27.0"
|
||||||
|
langchain-core = ">=0.1.40,<0.3"
|
||||||
|
pytest = ">=7,<9"
|
||||||
|
|
||||||
|
[package.source]
|
||||||
|
type = "directory"
|
||||||
|
url = "../standard-tests"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "langchain-text-splitters"
|
name = "langchain-text-splitters"
|
||||||
version = "0.2.2"
|
version = "0.2.2"
|
||||||
@ -2185,7 +2203,6 @@ files = [
|
|||||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
|
||||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||||
@ -3004,4 +3021,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.8.1,<4.0"
|
python-versions = ">=3.8.1,<4.0"
|
||||||
content-hash = "0a40678314005533ead4fefdbfd8bd27b043641ba99c2211409d8039703ed516"
|
content-hash = "8db47de0615d9a5324dc0e28f6110908e9b16ccfee699aeafef21f68c879e62a"
|
||||||
|
@ -85,6 +85,12 @@ pytest-asyncio = "^0.21.1"
|
|||||||
grandalf = "^0.8"
|
grandalf = "^0.8"
|
||||||
pytest-profiling = "^1.7.0"
|
pytest-profiling = "^1.7.0"
|
||||||
responses = "^0.25.0"
|
responses = "^0.25.0"
|
||||||
|
|
||||||
|
[tool.poetry.group.test.dependencies.langchain-standard-tests]
|
||||||
|
path = "../standard-tests"
|
||||||
|
develop = true
|
||||||
|
|
||||||
|
|
||||||
[[tool.poetry.group.test.dependencies.numpy]]
|
[[tool.poetry.group.test.dependencies.numpy]]
|
||||||
version = "^1.24.0"
|
version = "^1.24.0"
|
||||||
python = "<3.12"
|
python = "<3.12"
|
||||||
|
97
libs/core/tests/unit_tests/vectorstores/test_in_memory.py
Normal file
97
libs/core/tests/unit_tests/vectorstores/test_in_memory.py
Normal file
@ -0,0 +1,97 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from langchain_standard_tests.integration_tests.vectorstores import (
|
||||||
|
AsyncReadWriteTestSuite,
|
||||||
|
ReadWriteTestSuite,
|
||||||
|
)
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
from langchain_core.embeddings.fake import DeterministicFakeEmbedding
|
||||||
|
from langchain_core.vectorstores import InMemoryVectorStore
|
||||||
|
from tests.unit_tests.stubs import AnyStr
|
||||||
|
|
||||||
|
|
||||||
|
class TestInMemoryReadWriteTestSuite(ReadWriteTestSuite):
    """Run the standard sync read/write suite against InMemoryVectorStore."""

    @pytest.fixture
    def vectorstore(self) -> InMemoryVectorStore:
        """Provide an empty store built on the suite's embeddings."""
        embeddings = self.get_embeddings()
        return InMemoryVectorStore(embedding=embeddings)
|
||||||
|
|
||||||
|
|
||||||
|
class TestAsyncInMemoryReadWriteTestSuite(AsyncReadWriteTestSuite):
    """Run the standard async read/write suite against InMemoryVectorStore."""

    @pytest.fixture
    async def vectorstore(self) -> InMemoryVectorStore:
        """Provide an empty store built on the suite's embeddings."""
        embeddings = self.get_embeddings()
        return InMemoryVectorStore(embedding=embeddings)
|
||||||
|
|
||||||
|
|
||||||
|
async def test_inmemory() -> None:
    """Build a store from texts and check async similarity search results."""
    embedder = DeterministicFakeEmbedding(size=6)
    vs = await InMemoryVectorStore.afrom_texts(["foo", "bar", "baz"], embedder)

    # Best single match for "foo" is the "foo" document itself.
    top_one = await vs.asimilarity_search("foo", k=1)
    assert top_one == [Document(page_content="foo", id=AnyStr())]

    # Two best matches for "bar", in ranked order.
    top_two = await vs.asimilarity_search("bar", k=2)
    expected_two = [
        Document(page_content="bar", id=AnyStr()),
        Document(page_content="baz", id=AnyStr()),
    ]
    assert top_two == expected_two

    # Scores come back strictly decreasing for these two hits.
    scored = await vs.asimilarity_search_with_score("bar", k=2)
    best_score = scored[0][1]
    runner_up_score = scored[1][1]
    assert best_score > runner_up_score
|
||||||
|
|
||||||
|
|
||||||
|
async def test_add_by_ids() -> None:
    """Caller-supplied ids are returned and used as store keys (sync and async)."""
    embedder = DeterministicFakeEmbedding(size=6)
    vs = InMemoryVectorStore(embedding=embedder)

    # Sync path: the given ids come back and become the only keys.
    sync_ids = vs.add_texts(["foo", "bar", "baz"], ids=["1", "2", "3"])
    assert sync_ids == ["1", "2", "3"]
    assert sorted(vs.store.keys()) == ["1", "2", "3"]

    # Async path: new ids are added alongside the existing ones.
    async_ids = await vs.aadd_texts(["foo", "bar", "baz"], ids=["4", "5", "6"])
    assert async_ids == ["4", "5", "6"]
    assert sorted(vs.store.keys()) == ["1", "2", "3", "4", "5", "6"]
|
||||||
|
|
||||||
|
|
||||||
|
async def test_inmemory_mmr() -> None:
    """MMR search copes with k larger than the store and ranks as expected."""
    corpus = ["foo", "foo", "fou", "foy"]
    embedder = DeterministicFakeEmbedding(size=6)
    vs = await InMemoryVectorStore.afrom_texts(corpus, embedder)

    # make sure we can k > docstore size
    hits = await vs.amax_marginal_relevance_search("foo", k=10, lambda_mult=0.1)

    assert len(hits) == len(corpus)
    assert hits[0] == Document(page_content="foo", id=AnyStr())
    assert hits[1] == Document(page_content="foy", id=AnyStr())
|
||||||
|
|
||||||
|
|
||||||
|
async def test_inmemory_dump_load(tmp_path: Path) -> None:
    """A store written with dump() and re-read with load() searches the same."""
    embedder = DeterministicFakeEmbedding(size=6)
    original = await InMemoryVectorStore.afrom_texts(["foo", "bar", "baz"], embedder)
    expected = await original.asimilarity_search("foo", k=1)

    # Round-trip through a JSON file in pytest's temporary directory.
    json_file = str(tmp_path / "test.json")
    original.dump(json_file)
    restored = InMemoryVectorStore.load(json_file, embedder)

    actual = await restored.asimilarity_search("foo", k=1)
    assert expected == actual
|
||||||
|
|
||||||
|
|
||||||
|
async def test_inmemory_filter() -> None:
    """A metadata filter restricts search hits to matching documents."""
    texts = ["foo", "bar"]
    metadatas = [{"id": 1}, {"id": 2}]
    vs = await InMemoryVectorStore.afrom_texts(
        texts, DeterministicFakeEmbedding(size=6), metadatas
    )

    # Only the document whose metadata id is 1 may be returned.
    hits = await vs.asimilarity_search(
        "baz", filter=lambda doc: doc.metadata["id"] == 1
    )

    expected = [Document(page_content="foo", metadata={"id": 1}, id=AnyStr())]
    assert hits == expected
|
@ -1,18 +1,14 @@
|
|||||||
import pytest
|
import pytest
|
||||||
from langchain_core.vectorstores import VectorStore
|
from langchain_core.vectorstores import (
|
||||||
|
InMemoryVectorStore,
|
||||||
|
VectorStore,
|
||||||
|
)
|
||||||
|
|
||||||
from langchain_standard_tests.integration_tests.vectorstores import (
|
from langchain_standard_tests.integration_tests.vectorstores import (
|
||||||
AsyncReadWriteTestSuite,
|
AsyncReadWriteTestSuite,
|
||||||
ReadWriteTestSuite,
|
ReadWriteTestSuite,
|
||||||
)
|
)
|
||||||
|
|
||||||
# We'll need to move this dependency to core
|
|
||||||
pytest.importorskip("langchain_community")
|
|
||||||
|
|
||||||
from langchain_community.vectorstores.inmemory import ( # type: ignore # noqa
|
|
||||||
InMemoryVectorStore,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class TestInMemoryVectorStore(ReadWriteTestSuite):
|
class TestInMemoryVectorStore(ReadWriteTestSuite):
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
|
Loading…
Reference in New Issue
Block a user