Files
langchain/libs/partners/databricks/langchain_databricks/utils.py
Yuki Watanabe c7a8af2e75 databricks: add vector search and embeddings (#25648)
### Summary

Add `DatabricksVectorSearch` and `DatabricksEmbeddings` classes to the
`langchain-databricks` partner packages. Core functionality is
unchanged, but the vector search class is largely refactored for
readability and maintainability.

This PR does not add integration tests yet. This will be added once the
Databricks test workspace is ready.

Tagging @efriis as POC


### Tracker
[] Create a package and imgrate ChatDatabricks
[✍️] Migrate DatabricksVectorSearch, DatabricksEmbeddings, and their
docs
~[ ] Migrate UCFunctionToolkit and its doc~
[ ] Add provider document and update README.md
[ ] Add integration tests and set up secrets (after moved to an external
package)
[ ] Add deprecation note to the community implementations.

---------

Signed-off-by: B-Step62 <yuki.watanabe@databricks.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
2024-08-24 00:40:21 +00:00

102 lines
3.5 KiB
Python

from typing import Any, List, Union
from urllib.parse import urlparse
import numpy as np
def get_deployment_client(target_uri: str) -> Any:
if (target_uri != "databricks") and (urlparse(target_uri).scheme != "databricks"):
raise ValueError(
"Invalid target URI. The target URI must be a valid databricks URI."
)
try:
from mlflow.deployments import get_deploy_client # type: ignore[import-untyped]
return get_deploy_client(target_uri)
except ImportError as e:
raise ImportError(
"Failed to create the client. "
"Please run `pip install mlflow` to install "
"required dependencies."
) from e
# Utility function for Maximal Marginal Relevance (MMR) reranking.
# Copied from langchain_community/vectorstores/utils.py to avoid cross-dependency
Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray]
def maximal_marginal_relevance(
query_embedding: np.ndarray,
embedding_list: list,
lambda_mult: float = 0.5,
k: int = 4,
) -> List[int]:
"""Calculate maximal marginal relevance.
Args:
query_embedding: Query embedding.
embedding_list: List of embeddings to select from.
lambda_mult: Number between 0 and 1 that determines the degree
of diversity among the results with 0 corresponding
to maximum diversity and 1 to minimum diversity.
Defaults to 0.5.
k: Number of Documents to return. Defaults to 4.
Returns:
List of indices of embeddings selected by maximal marginal relevance.
"""
if min(k, len(embedding_list)) <= 0:
return []
if query_embedding.ndim == 1:
query_embedding = np.expand_dims(query_embedding, axis=0)
similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0]
most_similar = int(np.argmax(similarity_to_query))
idxs = [most_similar]
selected = np.array([embedding_list[most_similar]])
while len(idxs) < min(k, len(embedding_list)):
best_score = -np.inf
idx_to_add = -1
similarity_to_selected = cosine_similarity(embedding_list, selected)
for i, query_score in enumerate(similarity_to_query):
if i in idxs:
continue
redundant_score = max(similarity_to_selected[i])
equation_score = (
lambda_mult * query_score - (1 - lambda_mult) * redundant_score
)
if equation_score > best_score:
best_score = equation_score
idx_to_add = i
idxs.append(idx_to_add)
selected = np.append(selected, [embedding_list[idx_to_add]], axis=0)
return idxs
def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray:
"""Row-wise cosine similarity between two equal-width matrices.
Raises:
ValueError: If the number of columns in X and Y are not the same.
"""
if len(X) == 0 or len(Y) == 0:
return np.array([])
X = np.array(X)
Y = np.array(Y)
if X.shape[1] != Y.shape[1]:
raise ValueError(
"Number of columns in X and Y must be the same. X has shape"
f"{X.shape} "
f"and Y has shape {Y.shape}."
)
X_norm = np.linalg.norm(X, axis=1)
Y_norm = np.linalg.norm(Y, axis=1)
# Ignore divide by zero errors run time warnings as those are handled below.
with np.errstate(divide="ignore", invalid="ignore"):
similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm)
similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0
return similarity