mirror of
https://github.com/hwchase17/langchain.git
synced 2025-09-30 16:01:30 +00:00
## Description Created a helper method to make vector search indexes via client-side pymongo. **Recent Update** -- Removed error suppressing/overwriting layer in favor of letting the original exception provide information. ## ToDo's - [x] Make _wait_untils for integration test delete index functionalities. - [x] Add documentation for its use. Highlight it's experimental - [x] Post Integration Test Results in a screenshot - [x] Get review from MongoDB internal team (@shaneharvey, @blink1073 , @NoahStapp , @caseyclements) - [x] **Add tests and docs**: If you're adding a new integration, please include 1. Added new integration tests. Not eligible for unit testing since the operation is Atlas Cloud specific. 2. an example notebook showing its use. It lives in `docs/docs/integrations` directory.  - [x] **Lint and test**: Run `make format`, `make lint` and `make test` from the root of the package(s) you've modified. See contribution guidelines for more: https://python.langchain.com/docs/contributing/
91 lines
3.1 KiB
Python
91 lines
3.1 KiB
Python
"""
|
|
Tools for the Maximal Marginal Relevance (MMR) reranking.
|
|
Duplicated from langchain_community to avoid cross-dependencies.
|
|
|
|
Functions "maximal_marginal_relevance" and "cosine_similarity"
|
|
are duplicated in this utility respectively from modules:
|
|
- "libs/community/langchain_community/vectorstores/utils.py"
|
|
- "libs/community/langchain_community/utils/math.py"
|
|
"""
|
|
|
|
import logging
|
|
from typing import List, Union
|
|
|
|
import numpy as np
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
|
|
|
|
|
|
class FailCode:
|
|
INDEX_NOT_FOUND = 27
|
|
INDEX_ALREADY_EXISTS = 68
|
|
|
|
|
|
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
|
|
"""Row-wise cosine similarity between two equal-width matrices."""
|
|
if len(X) == 0 or len(Y) == 0:
|
|
return np.array([])
|
|
|
|
X = np.array(X)
|
|
Y = np.array(Y)
|
|
if X.shape[1] != Y.shape[1]:
|
|
raise ValueError(
|
|
f"Number of columns in X and Y must be the same. X has shape {X.shape} "
|
|
f"and Y has shape {Y.shape}."
|
|
)
|
|
try:
|
|
import simsimd as simd # type: ignore
|
|
|
|
X = np.array(X, dtype=np.float32)
|
|
Y = np.array(Y, dtype=np.float32)
|
|
Z = 1 - np.array(simd.cdist(X, Y, metric="cosine"))
|
|
return Z
|
|
except ImportError:
|
|
logger.debug(
|
|
"Unable to import simsimd, defaulting to NumPy implementation. If you want "
|
|
"to use simsimd please install with `pip install simsimd`."
|
|
)
|
|
X_norm = np.linalg.norm(X, axis=1)
|
|
Y_norm = np.linalg.norm(Y, axis=1)
|
|
# Ignore divide by zero errors run time warnings as those are handled below.
|
|
with np.errstate(divide="ignore", invalid="ignore"):
|
|
similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
|
|
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
|
|
return similarity
|
|
|
|
|
|
def maximal_marginal_relevance(
|
|
query_embedding: np.ndarray,
|
|
embedding_list: list,
|
|
lambda_mult: float = 0.5,
|
|
k: int = 4,
|
|
) -> List[int]:
|
|
"""Calculate maximal marginal relevance."""
|
|
if min(k, len(embedding_list)) <= 0:
|
|
return []
|
|
if query_embedding.ndim == 1:
|
|
query_embedding = np.expand_dims(query_embedding, axis=0)
|
|
similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
|
|
most_similar = int(np.argmax(similarity_to_query))
|
|
idxs = [most_similar]
|
|
selected = np.array([embedding_list[most_similar]])
|
|
while len(idxs) < min(k, len(embedding_list)):
|
|
best_score = -np.inf
|
|
idx_to_add = -1
|
|
similarity_to_selected = cosine_similarity(embedding_list, selected)
|
|
for i, query_score in enumerate(similarity_to_query):
|
|
if i in idxs:
|
|
continue
|
|
redundant_score = max(similarity_to_selected[i])
|
|
equation_score = (
|
|
lambda_mult * query_score - (1 - lambda_mult) * redundant_score
|
|
)
|
|
if equation_score > best_score:
|
|
best_score = equation_score
|
|
idx_to_add = i
|
|
idxs.append(idx_to_add)
|
|
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
|
|
return idxs
|