Azure Cognitive Search - update sdk b8, mod user agent, search with scores (#9191)

Description: Update the Azure Cognitive Search SDK (azure-search-documents)
from version 11.4.0b6 to 11.4.0b8 (breaking change).
Make the user agent customizable.
Implement similarity search with scores.

@baskaryan

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Fabrizio Ruocco
2023-08-25 11:34:09 +02:00
committed by GitHub
parent 135cb86215
commit cacaf487c3
4 changed files with 111 additions and 36 deletions

View File

@@ -73,6 +73,7 @@ def _get_search_client(
scoring_profiles: Optional[List[ScoringProfile]] = None,
default_scoring_profile: Optional[str] = None,
default_fields: Optional[List[SearchField]] = None,
user_agent: Optional[str] = "langchain",
) -> SearchClient:
from azure.core.credentials import AzureKeyCredential
from azure.core.exceptions import ResourceNotFoundError
@@ -80,13 +81,13 @@ def _get_search_client(
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
HnswVectorSearchAlgorithmConfiguration,
PrioritizedFields,
SearchIndex,
SemanticConfiguration,
SemanticField,
SemanticSettings,
VectorSearch,
VectorSearchAlgorithmConfiguration,
)
default_fields = default_fields or []
@@ -95,7 +96,7 @@ def _get_search_client(
else:
credential = AzureKeyCredential(key)
index_client: SearchIndexClient = SearchIndexClient(
endpoint=endpoint, credential=credential, user_agent="langchain"
endpoint=endpoint, credential=credential, user_agent=user_agent
)
try:
index_client.get_index(name=index_name)
@@ -130,10 +131,10 @@ def _get_search_client(
if vector_search is None:
vector_search = VectorSearch(
algorithm_configurations=[
VectorSearchAlgorithmConfiguration(
HnswVectorSearchAlgorithmConfiguration(
name="default",
kind="hnsw",
hnsw_parameters={ # type: ignore
parameters={ # type: ignore
"m": 4,
"efConstruction": 400,
"efSearch": 500,
@@ -171,7 +172,7 @@ def _get_search_client(
endpoint=endpoint,
index_name=index_name,
credential=credential,
user_agent="langchain",
user_agent=user_agent,
)
@@ -227,6 +228,9 @@ class AzureSearch(VectorStore):
type=SearchFieldDataType.String,
),
]
user_agent = "langchain"
if "user_agent" in kwargs and kwargs["user_agent"]:
user_agent += " " + kwargs["user_agent"]
self.client = _get_search_client(
azure_search_endpoint,
azure_search_key,
@@ -238,6 +242,7 @@ class AzureSearch(VectorStore):
scoring_profiles=scoring_profiles,
default_scoring_profile=default_scoring_profile,
default_fields=default_fields,
user_agent=user_agent,
)
self.search_type = search_type
self.semantic_configuration_name = semantic_configuration_name
@@ -321,6 +326,17 @@ class AzureSearch(VectorStore):
raise ValueError(f"search_type of {search_type} not allowed.")
return docs
def similarity_search_with_relevance_scores(
    self, query: str, k: int = 4, **kwargs: Any
) -> List[Tuple[Document, float]]:
    """
    Run a vector search for the query and return documents with their scores.

    Args:
        query: Text to search for.
        k: Number of results requested from the vector search. Defaults to 4.
        **kwargs: Forwarded to ``vector_search_with_score``. An optional
            ``score_threshold`` entry is removed before forwarding and used
            as an inclusive lower bound on the returned scores.

    Returns:
        List of (document, score) pairs as produced by
        ``vector_search_with_score``; when ``score_threshold`` is given,
        only pairs whose score is greater than or equal to it.
    """
    # Strip the threshold first so it is never passed down to the search call.
    threshold = kwargs.pop("score_threshold", None)
    scored_docs = self.vector_search_with_score(query, k=k, **kwargs)

    # No threshold requested: hand back the search results untouched.
    if threshold is None:
        return scored_docs

    return [pair for pair in scored_docs if pair[1] >= threshold]
def vector_search(self, query: str, k: int = 4, **kwargs: Any) -> List[Document]:
"""
Returns the most similar indexed documents to the query text.
@@ -349,12 +365,19 @@ class AzureSearch(VectorStore):
Returns:
List of Documents most similar to the query and score for each
"""
from azure.search.documents.models import Vector
results = self.client.search(
search_text="",
vector=np.array(self.embedding_function(query), dtype=np.float32).tolist(),
top_k=k,
vector_fields=FIELDS_CONTENT_VECTOR,
vectors=[
Vector(
value=np.array(
self.embedding_function(query), dtype=np.float32
).tolist(),
k=k,
fields=FIELDS_CONTENT_VECTOR,
)
],
select=[FIELDS_ID, FIELDS_CONTENT, FIELDS_METADATA],
filter=filters,
)
@@ -399,12 +422,19 @@ class AzureSearch(VectorStore):
Returns:
List of Documents most similar to the query and score for each
"""
from azure.search.documents.models import Vector
results = self.client.search(
search_text=query,
vector=np.array(self.embedding_function(query), dtype=np.float32).tolist(),
top_k=k,
vector_fields=FIELDS_CONTENT_VECTOR,
vectors=[
Vector(
value=np.array(
self.embedding_function(query), dtype=np.float32
).tolist(),
k=k,
fields=FIELDS_CONTENT_VECTOR,
)
],
select=[FIELDS_ID, FIELDS_CONTENT, FIELDS_METADATA],
filter=filters,
top=k,
@@ -452,11 +482,19 @@ class AzureSearch(VectorStore):
Returns:
List of Documents most similar to the query and score for each
"""
from azure.search.documents.models import Vector
results = self.client.search(
search_text=query,
vector=np.array(self.embedding_function(query), dtype=np.float32).tolist(),
top_k=50, # Hardcoded value to maximize L2 retrieval
vector_fields=FIELDS_CONTENT_VECTOR,
vectors=[
Vector(
value=np.array(
self.embedding_function(query), dtype=np.float32
).tolist(),
k=50,
fields=FIELDS_CONTENT_VECTOR,
)
],
select=[FIELDS_ID, FIELDS_CONTENT, FIELDS_METADATA],
filter=filters,
query_type="semantic",

View File

@@ -719,13 +719,13 @@ msal-extensions = ">=0.3.0,<2.0.0"
[[package]]
name = "azure-search-documents"
version = "11.4.0b6"
version = "11.4.0b8"
description = "Microsoft Azure Cognitive Search Client Library for Python"
optional = true
python-versions = ">=3.7"
files = [
{file = "azure-search-documents-11.4.0b6.zip", hash = "sha256:c9ebd7d99d3c7b879f48acad66141e1f50eae4468cfb8389a4b25d4c620e8df1"},
{file = "azure_search_documents-11.4.0b6-py3-none-any.whl", hash = "sha256:24ff85bf2680c36b38d8092bcbbe2d90699aac7c4a228b0839c0ce595a41628c"},
{file = "azure-search-documents-11.4.0b8.zip", hash = "sha256:b178ff52918590191a9cb7f411a9ab3cb517663666a501a3e84b715d19b0d93b"},
{file = "azure_search_documents-11.4.0b8-py3-none-any.whl", hash = "sha256:4137daa2db75bff9484d394c16c0604822a51281cad2f50e11d7c48dd8d4b4cf"},
]
[package.dependencies]
@@ -10447,4 +10447,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "88e479307b19d991105360780f67ed3258ef1a0151f70b9e91c86c8153751e83"
content-hash = "43a6bd42efc0baf917418087f788aaf3b1bc793cb4aa81de99c52ed6a7d54d26"

View File

@@ -105,7 +105,7 @@ nebula3-python = {version = "^3.4.0", optional = true}
mwparserfromhell = {version = "^0.6.4", optional = true}
mwxml = {version = "^0.3.3", optional = true}
awadb = {version = "^0.3.9", optional = true}
azure-search-documents = {version = "11.4.0b6", optional = true}
azure-search-documents = {version = "11.4.0b8", optional = true}
esprima = {version = "^4.0.1", optional = true}
streamlit = {version = "^1.18.0", optional = true, python = ">=3.8.1,<3.9.7 || >3.9.7,<4.0"}
psychicapi = {version = "^0.8.0", optional = true}