mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-31 00:29:57 +00:00
core[minor],community[patch],standard-tests[patch]: Move InMemoryImplementation to langchain-core (#23986)
This PR moves the in memory implementation to langchain-core. * The implementation remains importable from langchain-community. * Supporting utilities are marked as private for now.
This commit is contained in:
parent
aa8c9bb4a9
commit
f765e8fa9d
@ -1,249 +1,5 @@
|
||||
import json
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple
|
||||
from langchain_core.vectorstores import InMemoryVectorStore
|
||||
|
||||
import numpy as np
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
from langchain_core.load import dumpd, load
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
|
||||
from langchain_community.utils.math import cosine_similarity
|
||||
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
||||
|
||||
|
||||
class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Uses numpy to compute cosine similarity for search.

    Each stored document is a plain dict kept in ``self.store`` under its id,
    with the keys ``"id"``, ``"vector"``, ``"text"`` and ``"metadata"``.

    Args:
        embedding: embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
        """Create an empty store that embeds texts with ``embedding``."""
        self.store: Dict[str, Dict[str, Any]] = {}
        self.embedding = embedding

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding function used by this store."""
        return self.embedding

    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Remove the documents with the given ids.

        Ids that are not in the store are silently ignored.

        Args:
            ids: Ids to remove. ``None`` or an empty sequence removes nothing.
            **kwargs: Accepted for interface compatibility; not used.
        """
        if ids:
            for _id in ids:
                self.store.pop(_id, None)

    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Async variant of ``delete``; runs the synchronous version directly."""
        # Forward kwargs so the sync and async entry points see the same call.
        self.delete(ids, **kwargs)

    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
        """Embed the documents and insert them, overwriting entries with equal ids.

        Args:
            items: Documents to store. A document without an id gets a fresh
                ``uuid4`` string as its id.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A response whose ``"succeeded"`` list holds the ids in input order
            and whose ``"failed"`` list is always empty.
        """
        vectors = self.embedding.embed_documents([item.page_content for item in items])
        ids = []
        for item, vector in zip(items, vectors):
            doc_id = item.id or str(uuid.uuid4())
            ids.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vector,
                "text": item.page_content,
                "metadata": item.metadata,
            }
        return {
            "succeeded": ids,
            "failed": [],
        }

    def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Get documents by their ids.

        Ids that are not in the store are skipped, so the result may be
        shorter than ``ids``.
        """
        documents = []

        for doc_id in ids:
            doc = self.store.get(doc_id)
            if doc:
                documents.append(
                    Document(
                        id=doc["id"],
                        page_content=doc["text"],
                        metadata=doc["metadata"],
                    )
                )
        return documents

    async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Async variant of ``get_by_ids``; runs the synchronous version directly."""
        return self.get_by_ids(ids)

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Async variant of ``add_texts``; runs the synchronous version directly."""
        return self.add_texts(texts, metadatas, **kwargs)

    def _similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float, List[float]]]:
        """Rank stored documents by cosine similarity to ``embedding``.

        Args:
            embedding: Query vector.
            k: Maximum number of hits to return.
            filter: Optional predicate; only documents for which it returns a
                truthy value are considered.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            Up to ``k`` ``(document, similarity, stored_vector)`` tuples,
            ordered by decreasing similarity.
        """
        result = []
        for entry in self.store.values():
            candidate = Document(
                id=entry["id"], page_content=entry["text"], metadata=entry["metadata"]
            )
            # Filter before scoring so no similarity is computed for rejected
            # documents. The sort below is stable, so the outcome is the same
            # as scoring everything and filtering afterwards.
            if filter is not None and not filter(candidate):
                continue
            vector = entry["vector"]
            similarity = float(cosine_similarity([embedding], [vector]).item(0))
            result.append((candidate, similarity, vector))
        result.sort(key=lambda hit: hit[1], reverse=True)
        return result[:k]

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return up to ``k`` ``(document, similarity)`` pairs for a query vector.

        Args:
            embedding: Query vector.
            k: Maximum number of hits to return.
            filter: Optional predicate selecting the documents to consider.
            **kwargs: Forwarded to the internal search helper.
        """
        return [
            (doc, similarity)
            for doc, similarity, _ in self._similarity_search_with_score_by_vector(
                embedding=embedding, k=k, filter=filter, **kwargs
            )
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return up to ``k`` ``(document, similarity)`` pairs.

        ``**kwargs`` (for example ``filter``) are forwarded to
        ``similarity_search_with_score_by_vector``.
        """
        embedding = self.embedding.embed_query(query)
        docs = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return docs

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Async variant of ``similarity_search_with_score``; runs it directly."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to the query vector."""
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of ``similarity_search_by_vector``; runs it directly."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to the query text."""
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of ``similarity_search``; runs it directly."""
        return self.similarity_search(query, k, **kwargs)

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Select documents for a query vector by maximal marginal relevance.

        Args:
            embedding: Query vector.
            k: Number of documents requested from the MMR selection.
            fetch_k: Number of most similar documents fetched as MMR candidates.
            lambda_mult: Forwarded to ``maximal_marginal_relevance``.
            **kwargs: Forwarded to the internal similarity search
                (for example ``filter``).

        Returns:
            The candidate documents at the indices chosen by
            ``maximal_marginal_relevance``, in the order it returns them.
        """
        prefetch_hits = self._similarity_search_with_score_by_vector(
            embedding=embedding,
            k=fetch_k,
            **kwargs,
        )

        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [vector for _, _, vector in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and run ``max_marginal_relevance_search_by_vector``."""
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Build a store from ``texts``.

        Args:
            texts: Texts to add.
            embedding: Embedding function for the new store.
            metadatas: Optional metadata, forwarded to ``add_texts``.
            **kwargs: Forwarded to ``add_texts`` (not to the constructor).
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Async variant of ``from_texts``; runs the synchronous version directly."""
        return cls.from_texts(texts, embedding, metadatas, **kwargs)

    @classmethod
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> "InMemoryVectorStore":
        """Recreate a store from a JSON file written by ``dump``.

        Args:
            path: Location of the JSON file.
            embedding: Embedding function for the restored store.
            **kwargs: Forwarded to the class constructor.
        """
        _path: Path = Path(path)
        with _path.open("r") as f:
            # ``load`` here is the module-level import from langchain_core.load;
            # the classmethod's own name is not visible inside its body.
            store = load(json.load(f))
        vectorstore = cls(embedding=embedding, **kwargs)
        vectorstore.store = store
        return vectorstore

    def dump(self, path: str) -> None:
        """Write the store contents to ``path`` as JSON.

        Missing parent directories are created first.
        """
        _path: Path = Path(path)
        _path.parent.mkdir(exist_ok=True, parents=True)
        with _path.open("w") as f:
            json.dump(dumpd(self.store), f, indent=2)
|
||||
__all__ = [
|
||||
"InMemoryVectorStore",
|
||||
]
|
||||
|
9
libs/core/langchain_core/vectorstores/__init__.py
Normal file
9
libs/core/langchain_core/vectorstores/__init__.py
Normal file
@ -0,0 +1,9 @@
|
||||
from langchain_core.vectorstores.base import VST, VectorStore, VectorStoreRetriever
|
||||
from langchain_core.vectorstores.in_memory import InMemoryVectorStore
|
||||
|
||||
__all__ = [
|
||||
"VectorStore",
|
||||
"VST",
|
||||
"VectorStoreRetriever",
|
||||
"InMemoryVectorStore",
|
||||
]
|
275
libs/core/langchain_core/vectorstores/in_memory.py
Normal file
275
libs/core/langchain_core/vectorstores/in_memory.py
Normal file
@ -0,0 +1,275 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
from typing import (
|
||||
TYPE_CHECKING,
|
||||
Any,
|
||||
Callable,
|
||||
Dict,
|
||||
Iterable,
|
||||
List,
|
||||
Optional,
|
||||
Sequence,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.load import dumpd, load
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
from langchain_core.vectorstores.utils import _cosine_similarity as cosine_similarity
|
||||
from langchain_core.vectorstores.utils import (
|
||||
_maximal_marginal_relevance as maximal_marginal_relevance,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from langchain_core.indexing import UpsertResponse
|
||||
|
||||
|
||||
class InMemoryVectorStore(VectorStore):
    """In-memory implementation of VectorStore using a dictionary.

    Uses numpy to compute cosine similarity for search.

    Each stored document is a plain dict kept in ``self.store`` under its id,
    with the keys ``"id"``, ``"vector"``, ``"text"`` and ``"metadata"``.

    Args:
        embedding: embedding function to use.
    """

    def __init__(self, embedding: Embeddings) -> None:
        """Initialize with the given embedding function."""
        # TODO: would be nice to change to
        # Dict[str, Document] at some point (will be a breaking change)
        self.store: Dict[str, Dict[str, Any]] = {}
        self.embedding = embedding

    @property
    def embeddings(self) -> Embeddings:
        """Return the embedding function used by this store."""
        return self.embedding

    def delete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Remove the documents with the given ids.

        Ids that are not in the store are silently ignored.

        Args:
            ids: Ids to remove. ``None`` or an empty sequence removes nothing.
            **kwargs: Accepted for interface compatibility; not used.
        """
        if ids:
            for _id in ids:
                self.store.pop(_id, None)

    async def adelete(self, ids: Optional[Sequence[str]] = None, **kwargs: Any) -> None:
        """Async variant of ``delete``; runs the synchronous version directly."""
        # Forward kwargs so the sync and async entry points see the same call.
        self.delete(ids, **kwargs)

    def upsert(self, items: Sequence[Document], /, **kwargs: Any) -> UpsertResponse:
        """Embed the documents and insert them, overwriting entries with equal ids.

        Args:
            items: Documents to store. A document without an id gets a fresh
                ``uuid4`` string as its id.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            A response whose ``"succeeded"`` list holds the ids in input order
            and whose ``"failed"`` list is always empty.
        """
        vectors = self.embedding.embed_documents([item.page_content for item in items])
        ids = []
        for item, vector in zip(items, vectors):
            doc_id = item.id or str(uuid.uuid4())
            ids.append(doc_id)
            self.store[doc_id] = {
                "id": doc_id,
                "vector": vector,
                "text": item.page_content,
                "metadata": item.metadata,
            }
        return {
            "succeeded": ids,
            "failed": [],
        }

    def get_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Get documents by their ids.

        Ids that are not in the store are skipped, so the result may be
        shorter than ``ids``.
        """
        documents = []

        for doc_id in ids:
            doc = self.store.get(doc_id)
            if doc:
                documents.append(
                    Document(
                        id=doc["id"],
                        page_content=doc["text"],
                        metadata=doc["metadata"],
                    )
                )
        return documents

    async def aget_by_ids(self, ids: Sequence[str], /) -> List[Document]:
        """Async variant of ``get_by_ids``; runs the synchronous version directly."""
        return self.get_by_ids(ids)

    async def aadd_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Async variant of ``add_texts``; runs the synchronous version directly."""
        return self.add_texts(texts, metadatas, **kwargs)

    def _similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float, List[float]]]:
        """Rank stored documents by cosine similarity to ``embedding``.

        Args:
            embedding: Query vector.
            k: Maximum number of hits to return.
            filter: Optional predicate; only documents for which it returns a
                truthy value are considered.
            **kwargs: Accepted for interface compatibility; not used.

        Returns:
            Up to ``k`` ``(document, similarity, stored_vector)`` tuples,
            ordered by decreasing similarity.
        """
        result = []
        for entry in self.store.values():
            candidate = Document(
                id=entry["id"], page_content=entry["text"], metadata=entry["metadata"]
            )
            # Filter before scoring so no similarity is computed for rejected
            # documents. The sort below is stable, so the outcome is the same
            # as scoring everything and filtering afterwards.
            if filter is not None and not filter(candidate):
                continue
            vector = entry["vector"]
            similarity = float(cosine_similarity([embedding], [vector]).item(0))
            result.append((candidate, similarity, vector))
        result.sort(key=lambda hit: hit[1], reverse=True)
        return result[:k]

    def similarity_search_with_score_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        filter: Optional[Callable[[Document], bool]] = None,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return up to ``k`` ``(document, similarity)`` pairs for a query vector.

        Args:
            embedding: Query vector.
            k: Maximum number of hits to return.
            filter: Optional predicate selecting the documents to consider.
            **kwargs: Forwarded to the internal search helper.
        """
        return [
            (doc, similarity)
            for doc, similarity, _ in self._similarity_search_with_score_by_vector(
                embedding=embedding, k=k, filter=filter, **kwargs
            )
        ]

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Embed ``query`` and return up to ``k`` ``(document, similarity)`` pairs.

        ``**kwargs`` (for example ``filter``) are forwarded to
        ``similarity_search_with_score_by_vector``.
        """
        embedding = self.embedding.embed_query(query)
        docs = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return docs

    async def asimilarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Async variant of ``similarity_search_with_score``; runs it directly."""
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to the query vector."""
        docs_and_scores = self.similarity_search_with_score_by_vector(
            embedding,
            k,
            **kwargs,
        )
        return [doc for doc, _ in docs_and_scores]

    async def asimilarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of ``similarity_search_by_vector``; runs it directly."""
        return self.similarity_search_by_vector(embedding, k, **kwargs)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return up to ``k`` documents most similar to the query text."""
        return [doc for doc, _ in self.similarity_search_with_score(query, k, **kwargs)]

    async def asimilarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Async variant of ``similarity_search``; runs it directly."""
        return self.similarity_search(query, k, **kwargs)

    def max_marginal_relevance_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Select documents for a query vector by maximal marginal relevance.

        Args:
            embedding: Query vector.
            k: Number of documents requested from the MMR selection.
            fetch_k: Number of most similar documents fetched as MMR candidates.
            lambda_mult: Forwarded to the MMR helper as the weight on
                similarity to the query.
            **kwargs: Forwarded to the internal similarity search
                (for example ``filter``).

        Returns:
            The candidate documents at the indices chosen by the MMR helper,
            in the order it returns them.

        Raises:
            ImportError: If numpy is not installed.
        """
        prefetch_hits = self._similarity_search_with_score_by_vector(
            embedding=embedding,
            k=fetch_k,
            **kwargs,
        )

        try:
            import numpy as np
        except ImportError as e:
            # The two sentences used to be concatenated without a separator.
            raise ImportError(
                "numpy must be installed to use max_marginal_relevance_search. "
                "Please install it with `pip install numpy`."
            ) from e

        mmr_chosen_indices = maximal_marginal_relevance(
            np.array(embedding, dtype=np.float32),
            [vector for _, _, vector in prefetch_hits],
            k=k,
            lambda_mult=lambda_mult,
        )
        return [prefetch_hits[idx][0] for idx in mmr_chosen_indices]

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Embed ``query`` and run ``max_marginal_relevance_search_by_vector``."""
        embedding_vector = self.embedding.embed_query(query)
        return self.max_marginal_relevance_search_by_vector(
            embedding_vector,
            k,
            fetch_k,
            lambda_mult=lambda_mult,
            **kwargs,
        )

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Build a store from ``texts``.

        Args:
            texts: Texts to add.
            embedding: Embedding function for the new store.
            metadatas: Optional metadata, forwarded to ``add_texts``.
            **kwargs: Forwarded to ``add_texts`` (not to the constructor).
        """
        store = cls(
            embedding=embedding,
        )
        store.add_texts(texts=texts, metadatas=metadatas, **kwargs)
        return store

    @classmethod
    async def afrom_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "InMemoryVectorStore":
        """Async variant of ``from_texts``; runs the synchronous version directly."""
        return cls.from_texts(texts, embedding, metadatas, **kwargs)

    @classmethod
    def load(
        cls, path: str, embedding: Embeddings, **kwargs: Any
    ) -> "InMemoryVectorStore":
        """Recreate a store from a JSON file written by ``dump``.

        Args:
            path: Location of the JSON file.
            embedding: Embedding function for the restored store.
            **kwargs: Forwarded to the class constructor.
        """
        _path: Path = Path(path)
        with _path.open("r") as f:
            # ``load`` here is the module-level import from langchain_core.load;
            # the classmethod's own name is not visible inside its body.
            store = load(json.load(f))
        vectorstore = cls(embedding=embedding, **kwargs)
        vectorstore.store = store
        return vectorstore

    def dump(self, path: str) -> None:
        """Write the store contents to ``path`` as JSON.

        Missing parent directories are created first.
        """
        _path: Path = Path(path)
        _path.parent.mkdir(exist_ok=True, parents=True)
        with _path.open("w") as f:
            json.dump(dumpd(self.store), f, indent=2)
|
100
libs/core/langchain_core/vectorstores/utils.py
Normal file
100
libs/core/langchain_core/vectorstores/utils.py
Normal file
@ -0,0 +1,100 @@
|
||||
"""Internal utilities for the in memory implementation of VectorStore.
|
||||
|
||||
These are part of a private API and users should not use them directly
|
||||
as they can change without notice.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, List, Union
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import numpy as np
|
||||
|
||||
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
    """Compute the pairwise cosine similarity between the rows of two matrices.

    Args:
        X: Matrix whose rows are vectors.
        Y: Matrix whose rows are vectors of the same width as those of ``X``.

    Returns:
        An array whose entry ``[i, j]`` is the cosine similarity between row
        ``i`` of ``X`` and row ``j`` of ``Y``. An empty 1-D array is returned
        when either input has length zero.

    Raises:
        ImportError: If numpy is not installed.
        ValueError: If ``X`` and ``Y`` have different numbers of columns.
    """
    try:
        import numpy as np
    except ImportError:
        raise ImportError(
            "cosine_similarity requires numpy to be installed. "
            "Please install numpy with `pip install numpy`."
        )

    # Nothing to compare when either side is empty.
    if 0 in (len(X), len(Y)):
        return np.array([])

    X, Y = np.array(X), np.array(Y)
    if X.shape[1] != Y.shape[1]:
        raise ValueError(
            f"Number of columns in X and Y must be the same. X has shape {X.shape} "
            f"and Y has shape {Y.shape}."
        )

    try:
        # Optional accelerated path: simsimd works on float32 and returns
        # cosine *distances*, hence the ``1 - ...`` conversion.
        import simsimd as simd  # type: ignore

        X = np.array(X, dtype=np.float32)
        Y = np.array(Y, dtype=np.float32)
        return 1 - np.array(simd.cdist(X, Y, metric="cosine"))
    except ImportError:
        logger.debug(
            "Unable to import simsimd, defaulting to NumPy implementation. If you want "
            "to use simsimd please install with `pip install simsimd`."
        )

    # Pure NumPy fallback: dot products divided by the outer product of norms.
    row_norms = np.linalg.norm(X, axis=1)
    col_norms = np.linalg.norm(Y, axis=1)
    # Zero-norm rows cause 0/0 or x/0; silence the warnings and zero out the
    # resulting NaN/inf entries right after.
    with np.errstate(divide="ignore", invalid="ignore"):
        similarity = np.dot(X, Y.T) / np.outer(row_norms, col_norms)
    similarity[~np.isfinite(similarity)] = 0.0
    return similarity
|
||||
|
||||
|
||||
def _maximal_marginal_relevance(
    query_embedding: np.ndarray,
    embedding_list: list,
    lambda_mult: float = 0.5,
    k: int = 4,
) -> List[int]:
    """Pick embeddings greedily by maximal marginal relevance (MMR).

    The first pick is the embedding most similar to the query. Every later
    pick maximizes ``lambda_mult * similarity_to_query - (1 - lambda_mult) *
    highest_similarity_to_an_already_picked_embedding``.

    Args:
        query_embedding: Query vector; a 1-D array is expanded to a single row.
        embedding_list: Candidate embeddings.
        lambda_mult: Weight on similarity to the query; the remainder weights
            the penalty for similarity to already picked embeddings.
        k: Maximum number of indices to return.

    Returns:
        Indices into ``embedding_list`` in the order they were picked. The
        list is empty when ``k`` or the number of candidates is not positive.

    Raises:
        ImportError: If numpy is not installed.
    """
    try:
        import numpy as np
    except ImportError:
        raise ImportError(
            "maximal_marginal_relevance requires numpy to be installed. "
            "Please install numpy with `pip install numpy`."
        )

    # Never pick more entries than there are candidates.
    target_count = min(k, len(embedding_list))
    if target_count <= 0:
        return []

    if query_embedding.ndim == 1:
        query_embedding = np.expand_dims(query_embedding, axis=0)
    similarity_to_query = _cosine_similarity(query_embedding, embedding_list)[0]

    first_pick = int(np.argmax(similarity_to_query))
    idxs = [first_pick]
    selected = np.array([embedding_list[first_pick]])

    while len(idxs) < target_count:
        best_score = -np.inf
        idx_to_add = -1
        # Row i holds candidate i's similarity to every embedding picked so far.
        similarity_to_selected = _cosine_similarity(embedding_list, selected)
        for i, query_score in enumerate(similarity_to_query):
            if i not in idxs:
                redundant_score = max(similarity_to_selected[i])
                equation_score = (
                    lambda_mult * query_score - (1 - lambda_mult) * redundant_score
                )
                if equation_score > best_score:
                    best_score, idx_to_add = equation_score, i
        idxs.append(idx_to_add)
        selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
    return idxs
|
23
libs/core/poetry.lock
generated
23
libs/core/poetry.lock
generated
@ -1,4 +1,4 @@
|
||||
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
|
||||
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
|
||||
|
||||
[[package]]
|
||||
name = "annotated-types"
|
||||
@ -1197,6 +1197,24 @@ files = [
|
||||
{file = "jupyterlab_widgets-3.0.11.tar.gz", hash = "sha256:dd5ac679593c969af29c9bed054c24f26842baa51352114736756bc035deee27"},
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "langchain-standard-tests"
|
||||
version = "0.1.1"
|
||||
description = "Standard tests for LangChain implementations"
|
||||
optional = false
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
files = []
|
||||
develop = true
|
||||
|
||||
[package.dependencies]
|
||||
httpx = "^0.27.0"
|
||||
langchain-core = ">=0.1.40,<0.3"
|
||||
pytest = ">=7,<9"
|
||||
|
||||
[package.source]
|
||||
type = "directory"
|
||||
url = "../standard-tests"
|
||||
|
||||
[[package]]
|
||||
name = "langchain-text-splitters"
|
||||
version = "0.2.2"
|
||||
@ -2185,7 +2203,6 @@ files = [
|
||||
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
|
||||
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
|
||||
@ -3004,4 +3021,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools",
|
||||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = ">=3.8.1,<4.0"
|
||||
content-hash = "0a40678314005533ead4fefdbfd8bd27b043641ba99c2211409d8039703ed516"
|
||||
content-hash = "8db47de0615d9a5324dc0e28f6110908e9b16ccfee699aeafef21f68c879e62a"
|
||||
|
@ -85,6 +85,12 @@ pytest-asyncio = "^0.21.1"
|
||||
grandalf = "^0.8"
|
||||
pytest-profiling = "^1.7.0"
|
||||
responses = "^0.25.0"
|
||||
|
||||
[tool.poetry.group.test.dependencies.langchain-standard-tests]
|
||||
path = "../standard-tests"
|
||||
develop = true
|
||||
|
||||
|
||||
[[tool.poetry.group.test.dependencies.numpy]]
|
||||
version = "^1.24.0"
|
||||
python = "<3.12"
|
||||
|
97
libs/core/tests/unit_tests/vectorstores/test_in_memory.py
Normal file
97
libs/core/tests/unit_tests/vectorstores/test_in_memory.py
Normal file
@ -0,0 +1,97 @@
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from langchain_standard_tests.integration_tests.vectorstores import (
|
||||
AsyncReadWriteTestSuite,
|
||||
ReadWriteTestSuite,
|
||||
)
|
||||
|
||||
from langchain_core.documents import Document
|
||||
from langchain_core.embeddings.fake import DeterministicFakeEmbedding
|
||||
from langchain_core.vectorstores import InMemoryVectorStore
|
||||
from tests.unit_tests.stubs import AnyStr
|
||||
|
||||
|
||||
class TestInMemoryReadWriteTestSuite(ReadWriteTestSuite):
    """Run ``ReadWriteTestSuite`` against ``InMemoryVectorStore``."""

    @pytest.fixture
    def vectorstore(self) -> InMemoryVectorStore:
        """Provide a newly constructed store using the suite's embeddings."""
        suite_embeddings = self.get_embeddings()
        return InMemoryVectorStore(embedding=suite_embeddings)
|
||||
|
||||
|
||||
class TestAsyncInMemoryReadWriteTestSuite(AsyncReadWriteTestSuite):
    """Run ``AsyncReadWriteTestSuite`` against ``InMemoryVectorStore``."""

    @pytest.fixture
    async def vectorstore(self) -> InMemoryVectorStore:
        """Provide a newly constructed store using the suite's embeddings."""
        suite_embeddings = self.get_embeddings()
        return InMemoryVectorStore(embedding=suite_embeddings)
|
||||
|
||||
|
||||
async def test_inmemory() -> None:
    """Build a store from three texts and check the async similarity searches."""
    fake_embedding = DeterministicFakeEmbedding(size=6)
    store = await InMemoryVectorStore.afrom_texts(
        ["foo", "bar", "baz"], fake_embedding
    )

    # Querying with a stored text must return that text as the single best hit.
    top_hit = await store.asimilarity_search("foo", k=1)
    assert top_hit == [Document(page_content="foo", id=AnyStr())]

    top_two = await store.asimilarity_search("bar", k=2)
    expected_top_two = [
        Document(page_content="bar", id=AnyStr()),
        Document(page_content="baz", id=AnyStr()),
    ]
    assert top_two == expected_top_two

    # Scored hits come back best first: the first score beats the second.
    scored_hits = await store.asimilarity_search_with_score("bar", k=2)
    best_hit = scored_hits[0]
    runner_up = scored_hits[1]
    assert best_hit[1] > runner_up[1]
|
||||
|
||||
|
||||
async def test_add_by_ids() -> None:
    """Check that caller-supplied ids are used by ``add_texts`` and ``aadd_texts``."""
    fake_embedding = DeterministicFakeEmbedding(size=6)
    vectorstore = InMemoryVectorStore(embedding=fake_embedding)

    # Sync path: the returned ids and the store keys are the supplied ids.
    sync_ids = vectorstore.add_texts(["foo", "bar", "baz"], ids=["1", "2", "3"])
    assert sync_ids == ["1", "2", "3"]
    keys_after_sync = sorted(vectorstore.store.keys())
    assert keys_after_sync == ["1", "2", "3"]

    # Async path: new ids are added next to the ones already stored.
    async_ids = await vectorstore.aadd_texts(
        ["foo", "bar", "baz"], ids=["4", "5", "6"]
    )
    assert async_ids == ["4", "5", "6"]
    keys_after_async = sorted(vectorstore.store.keys())
    assert keys_after_async == ["1", "2", "3", "4", "5", "6"]
|
||||
|
||||
|
||||
async def test_inmemory_mmr() -> None:
    """Check async MMR search when ``k`` exceeds the number of stored texts."""
    texts = ["foo", "foo", "fou", "foy"]
    fake_embedding = DeterministicFakeEmbedding(size=6)
    docsearch = await InMemoryVectorStore.afrom_texts(texts, fake_embedding)

    # make sure we can k > docstore size
    picked = await docsearch.amax_marginal_relevance_search(
        "foo", k=10, lambda_mult=0.1
    )

    # Every stored text is returned exactly once, not ten results.
    assert len(picked) == len(texts)
    first_pick, second_pick = picked[0], picked[1]
    assert first_pick == Document(page_content="foo", id=AnyStr())
    assert second_pick == Document(page_content="foy", id=AnyStr())
|
||||
|
||||
|
||||
async def test_inmemory_dump_load(tmp_path: Path) -> None:
    """Check that a store dumped to JSON and loaded back gives the same search result."""
    embedding = DeterministicFakeEmbedding(size=6)
    original_store = await InMemoryVectorStore.afrom_texts(
        ["foo", "bar", "baz"], embedding
    )
    expected = await original_store.asimilarity_search("foo", k=1)

    # Round-trip the store through a file in pytest's temporary directory.
    dump_file = str(tmp_path / "test.json")
    original_store.dump(dump_file)
    restored_store = InMemoryVectorStore.load(dump_file, embedding)

    actual = await restored_store.asimilarity_search("foo", k=1)
    assert expected == actual
|
||||
|
||||
|
||||
async def test_inmemory_filter() -> None:
    """Check that a ``filter`` callable restricts async search to matching documents."""
    texts = ["foo", "bar"]
    metadatas = [{"id": 1}, {"id": 2}]
    store = await InMemoryVectorStore.afrom_texts(
        texts,
        DeterministicFakeEmbedding(size=6),
        metadatas,
    )

    def has_first_id(doc: Document) -> bool:
        """Accept only the document whose metadata id is 1."""
        return doc.metadata["id"] == 1

    # The query matches neither text; only the document passing the filter remains.
    hits = await store.asimilarity_search("baz", filter=has_first_id)
    expected_hit = Document(page_content="foo", metadata={"id": 1}, id=AnyStr())
    assert hits == [expected_hit]
|
@ -1,18 +1,14 @@
|
||||
import pytest
|
||||
from langchain_core.vectorstores import VectorStore
|
||||
from langchain_core.vectorstores import (
|
||||
InMemoryVectorStore,
|
||||
VectorStore,
|
||||
)
|
||||
|
||||
from langchain_standard_tests.integration_tests.vectorstores import (
|
||||
AsyncReadWriteTestSuite,
|
||||
ReadWriteTestSuite,
|
||||
)
|
||||
|
||||
# We'll need to move this dependency to core
|
||||
pytest.importorskip("langchain_community")
|
||||
|
||||
from langchain_community.vectorstores.inmemory import ( # type: ignore # noqa
|
||||
InMemoryVectorStore,
|
||||
)
|
||||
|
||||
|
||||
class TestInMemoryVectorStore(ReadWriteTestSuite):
|
||||
@pytest.fixture
|
||||
|
Loading…
Reference in New Issue
Block a user