mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-31 10:23:18 +00:00
[Matching Engine] Update the Matching Engine to include the distance and filters (#12555)
Hello 👋, This Pull Request adds more capability to the [MatchingEngine](https://api.python.langchain.com/en/latest/vectorstores/langchain.vectorstores.matching_engine.MatchingEngine.html) vectorstore of GCP. It includes the `similarity_search_by_vector_with_relevance_scores` function and also [filters](https://cloud.google.com/vertex-ai/docs/vector-search/filtering) to `filter` the namespaces when retrieving the results. - **Description:** Add [filter](https://cloud.google.com/python/docs/reference/aiplatform/latest/google.cloud.aiplatform.MatchingEngineIndexEndpoint#google_cloud_aiplatform_MatchingEngineIndexEndpoint_find_neighbors) in `similarity_search` and add `similarity_search_by_vector_with_relevance_scores` method - **Dependencies:** None - **Tag maintainer:** Unknown Thank you! --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
@@ -4,7 +4,7 @@ import json
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Type
|
||||
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
|
||||
|
||||
from langchain.schema.document import Document
|
||||
from langchain.schema.embeddings import Embeddings
|
||||
@@ -14,6 +14,9 @@ from langchain.utilities.vertexai import get_client_info
|
||||
if TYPE_CHECKING:
|
||||
from google.cloud import storage
|
||||
from google.cloud.aiplatform import MatchingEngineIndex, MatchingEngineIndexEndpoint
|
||||
from google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint import (
|
||||
Namespace,
|
||||
)
|
||||
from google.oauth2.service_account import Credentials
|
||||
|
||||
from langchain.embeddings import TensorflowHubEmbeddings
|
||||
@@ -169,41 +172,85 @@ class MatchingEngine(VectorStore):
|
||||
blob = bucket.blob(gcs_location)
|
||||
blob.upload_from_string(data)
|
||||
|
||||
def similarity_search(
|
||||
self, query: str, k: int = 4, **kwargs: Any
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to query.
|
||||
def similarity_search_with_score(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
filter: Optional[List[Namespace]] = None,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs most similar to query and their cosine distance from the query.
|
||||
|
||||
Args:
|
||||
query: The string that will be used to search for similar documents.
|
||||
k: The amount of neighbors that will be retrieved.
|
||||
query: String query look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
filter: Optional. A list of Namespaces for filtering
|
||||
the matching results.
|
||||
For example:
|
||||
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
|
||||
will match datapoints that satisfy "red color" but not include
|
||||
datapoints with "squared shape". Please refer to
|
||||
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
|
||||
for more detail.
|
||||
|
||||
Returns:
|
||||
A list of k matching documents.
|
||||
List[Tuple[Document, float]]: List of documents most similar to
|
||||
the query text and cosine distance in float for each.
|
||||
Lower score represents more similarity.
|
||||
"""
|
||||
|
||||
logger.debug(f"Embedding query {query}.")
|
||||
embedding_query = self.embedding.embed_documents([query])
|
||||
embedding_query = self.embedding.embed_query(query)
|
||||
return self.similarity_search_by_vector_with_score(
|
||||
embedding_query, k=k, filter=filter
|
||||
)
|
||||
|
||||
def similarity_search_by_vector_with_score(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
filter: Optional[List[Namespace]] = None,
|
||||
) -> List[Tuple[Document, float]]:
|
||||
"""Return docs most similar to the embedding and their cosine distance.
|
||||
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
filter: Optional. A list of Namespaces for filtering
|
||||
the matching results.
|
||||
For example:
|
||||
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
|
||||
will match datapoints that satisfy "red color" but not include
|
||||
datapoints with "squared shape". Please refer to
|
||||
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
|
||||
for more detail.
|
||||
|
||||
Returns:
|
||||
List[Tuple[Document, float]]: List of documents most similar to
|
||||
the query text and cosine distance in float for each.
|
||||
Lower score represents more similarity.
|
||||
"""
|
||||
filter = filter or []
|
||||
|
||||
# If the endpoint is public we use the find_neighbors function.
|
||||
if self.endpoint._public_match_client:
|
||||
response = self.endpoint.find_neighbors(
|
||||
deployed_index_id=self._get_index_id(),
|
||||
queries=embedding_query,
|
||||
queries=[embedding],
|
||||
num_neighbors=k,
|
||||
filter=filter,
|
||||
)
|
||||
else:
|
||||
response = self.endpoint.match(
|
||||
deployed_index_id=self._get_index_id(),
|
||||
queries=embedding_query,
|
||||
queries=[embedding],
|
||||
num_neighbors=k,
|
||||
filter=filter,
|
||||
)
|
||||
|
||||
logger.debug(f"Found {len(response)} matches.")
|
||||
|
||||
if len(response) == 0:
|
||||
return []
|
||||
|
||||
logger.debug(f"Found {len(response)} matches for the query {query}.")
|
||||
|
||||
results = []
|
||||
|
||||
# I'm only getting the first one because queries receives an array
|
||||
@@ -212,12 +259,70 @@ class MatchingEngine(VectorStore):
|
||||
# one element.
|
||||
for doc in response[0]:
|
||||
page_content = self._download_from_gcs(f"documents/{doc.id}")
|
||||
results.append(Document(page_content=page_content))
|
||||
results.append((Document(page_content=page_content), doc.distance))
|
||||
|
||||
logger.debug("Downloaded documents for query.")
|
||||
|
||||
return results
|
||||
|
||||
def similarity_search(
|
||||
self,
|
||||
query: str,
|
||||
k: int = 4,
|
||||
filter: Optional[List[Namespace]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to query.
|
||||
|
||||
Args:
|
||||
query: The string that will be used to search for similar documents.
|
||||
k: The amount of neighbors that will be retrieved.
|
||||
filter: Optional. A list of Namespaces for filtering the matching results.
|
||||
For example:
|
||||
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
|
||||
will match datapoints that satisfy "red color" but not include
|
||||
datapoints with "squared shape". Please refer to
|
||||
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
|
||||
for more detail.
|
||||
|
||||
Returns:
|
||||
A list of k matching documents.
|
||||
"""
|
||||
docs_and_scores = self.similarity_search_with_score(
|
||||
query, k=k, filter=filter, **kwargs
|
||||
)
|
||||
|
||||
return [doc for doc, _ in docs_and_scores]
|
||||
|
||||
def similarity_search_by_vector(
|
||||
self,
|
||||
embedding: List[float],
|
||||
k: int = 4,
|
||||
filter: Optional[List[Namespace]] = None,
|
||||
**kwargs: Any,
|
||||
) -> List[Document]:
|
||||
"""Return docs most similar to the embedding.
|
||||
|
||||
Args:
|
||||
embedding: Embedding to look up documents similar to.
|
||||
k: The amount of neighbors that will be retrieved.
|
||||
filter: Optional. A list of Namespaces for filtering the matching results.
|
||||
For example:
|
||||
[Namespace("color", ["red"], []), Namespace("shape", [], ["squared"])]
|
||||
will match datapoints that satisfy "red color" but not include
|
||||
datapoints with "squared shape". Please refer to
|
||||
https://cloud.google.com/vertex-ai/docs/matching-engine/filtering#json
|
||||
for more detail.
|
||||
|
||||
Returns:
|
||||
A list of k matching documents.
|
||||
"""
|
||||
docs_and_scores = self.similarity_search_by_vector_with_score(
|
||||
embedding, k=k, filter=filter, **kwargs
|
||||
)
|
||||
|
||||
return [doc for doc, _ in docs_and_scores]
|
||||
|
||||
def _get_index_id(self) -> str:
|
||||
"""Gets the correct index id for the endpoint.
|
||||
|
||||
|
Reference in New Issue
Block a user