community[patch]: add possibility to search by vector in OpenSearchVectorSearch (#17878)

- **Description:** implements the missing `similarity_search_by_vector`
function for `OpenSearchVectorSearch`
- **Issue:** N/A
- **Dependencies:** N/A
This commit is contained in:
Karim Assi 2024-02-22 00:44:55 +01:00 committed by GitHub
parent 144f59b5fe
commit afc1ba0329
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -516,6 +516,15 @@ class OpenSearchVectorSearch(VectorStore):
docs_with_scores = self.similarity_search_with_score(query, k, **kwargs) docs_with_scores = self.similarity_search_with_score(query, k, **kwargs)
return [doc[0] for doc in docs_with_scores] return [doc[0] for doc in docs_with_scores]
def similarity_search_by_vector(
    self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Document]:
    """Look up the documents closest to a precomputed embedding vector.

    Thin wrapper around `similarity_search_with_score_by_vector`: the
    scored search is run with the same arguments and only the first
    element of each returned entry is kept.

    Args:
        embedding: Embedding vector to look up documents similar to.
        k: Passed through to the scored search as the number of
            results requested. Defaults to 4.
        **kwargs: Forwarded unchanged to
            `similarity_search_with_score_by_vector`.

    Returns:
        The first element of each entry returned by the scored search,
        in the same order.
    """
    return [
        scored_entry[0]
        for scored_entry in self.similarity_search_with_score_by_vector(
            embedding, k, **kwargs
        )
    ]
def similarity_search_with_score( def similarity_search_with_score(
self, query: str, k: int = 4, **kwargs: Any self, query: str, k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]: ) -> List[Tuple[Document, float]]:
@ -534,19 +543,43 @@ class OpenSearchVectorSearch(VectorStore):
Optional Args: Optional Args:
same as `similarity_search` same as `similarity_search`
""" """
embedding = self.embedding_function.embed_query(query)
return self.similarity_search_with_score_by_vector(embedding, k, **kwargs)
def similarity_search_with_score_by_vector(
self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
"""Return docs and their scores most similar to the embedding vector.
By default, supports Approximate Search.
Also supports Script Scoring and Painless Scripting.
Args:
embedding: Embedding vector to look up documents similar to.
k: Number of Documents to return. Defaults to 4.
Returns:
List of Documents along with their scores most similar to the embedding vector.
Optional Args:
same as `similarity_search`
"""
text_field = kwargs.get("text_field", "text") text_field = kwargs.get("text_field", "text")
metadata_field = kwargs.get("metadata_field", "metadata") metadata_field = kwargs.get("metadata_field", "metadata")
hits = self._raw_similarity_search_with_score(query=query, k=k, **kwargs) hits = self._raw_similarity_search_with_score_by_vector(
embedding=embedding, k=k, **kwargs
)
documents_with_scores = [ documents_with_scores = [
( (
Document( Document(
page_content=hit["_source"][text_field], page_content=hit["_source"][text_field],
metadata=hit["_source"] metadata=(
hit["_source"]
if metadata_field == "*" or metadata_field not in hit["_source"] if metadata_field == "*" or metadata_field not in hit["_source"]
else hit["_source"][metadata_field], else hit["_source"][metadata_field]
),
), ),
hit["_score"], hit["_score"],
) )
@ -554,26 +587,25 @@ class OpenSearchVectorSearch(VectorStore):
] ]
return documents_with_scores return documents_with_scores
def _raw_similarity_search_with_score( def _raw_similarity_search_with_score_by_vector(
self, query: str, k: int = 4, **kwargs: Any self, embedding: List[float], k: int = 4, **kwargs: Any
) -> List[dict]: ) -> List[dict]:
"""Return raw opensearch documents (dict) including vectors, """Return raw opensearch documents (dict) including vectors,
scores most similar to query. scores most similar to the embedding vector.
By default, supports Approximate Search. By default, supports Approximate Search.
Also supports Script Scoring and Painless Scripting. Also supports Script Scoring and Painless Scripting.
Args: Args:
query: Text to look up documents similar to. embedding: Embedding vector to look up documents similar to.
k: Number of Documents to return. Defaults to 4. k: Number of Documents to return. Defaults to 4.
Returns: Returns:
List of dict with its scores most similar to the query. List of dict with its scores most similar to the embedding.
Optional Args: Optional Args:
same as `similarity_search` same as `similarity_search`
""" """
embedding = self.embedding_function.embed_query(query)
search_type = kwargs.get("search_type", "approximate_search") search_type = kwargs.get("search_type", "approximate_search")
vector_field = kwargs.get("vector_field", "vector_field") vector_field = kwargs.get("vector_field", "vector_field")
index_name = kwargs.get("index_name", self.index_name) index_name = kwargs.get("index_name", self.index_name)
@ -702,7 +734,9 @@ class OpenSearchVectorSearch(VectorStore):
embedding = self.embedding_function.embed_query(query) embedding = self.embedding_function.embed_query(query)
# Do ANN/KNN search to get top fetch_k results where fetch_k >= k # Do ANN/KNN search to get top fetch_k results where fetch_k >= k
results = self._raw_similarity_search_with_score(query, fetch_k, **kwargs) results = self._raw_similarity_search_with_score_by_vector(
embedding, fetch_k, **kwargs
)
embeddings = [result["_source"][vector_field] for result in results] embeddings = [result["_source"][vector_field] for result in results]