OpenSearch: Add Support for Lucene Filter (#3201)

### Description
Add support for the Lucene filter. When you specify a Lucene filter for a
k-NN search, the Lucene algorithm decides whether to perform an exact
k-NN search with pre-filtering or an approximate search with modified
post-filtering. This filter is supported only for approximate search
on indexes that were created using the `lucene` engine.

OpenSearch Documentation -
https://opensearch.org/docs/latest/search-plugins/knn/filter-search-knn/#lucene-k-nn-filter-implementation

Signed-off-by: Naveen Tatikonda <navtat@amazon.com>
This commit is contained in:
Naveen Tatikonda 2023-04-20 22:42:53 -05:00 committed by GitHub
parent 36720cb57f
commit bb6c459f7a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 41 additions and 2 deletions

View File

@ -168,6 +168,21 @@ def _approximate_search_query_with_boolean_filter(
} }
def _approximate_search_query_with_lucene_filter(
    query_vector: List[float],
    lucene_filter: Dict,
    size: int = 4,
    k: int = 4,
    vector_field: str = "vector_field",
) -> Dict:
    """Build an approximate k-NN search query that carries a Lucene filter.

    The base query comes from ``_default_approximate_search_query``; the
    given ``lucene_filter`` is then stored under the ``"filter"`` key of the
    k-NN clause for ``vector_field`` and the resulting query is returned.
    """
    query = _default_approximate_search_query(query_vector, size, k, vector_field)
    # Attach the filter inside the k-NN clause of the target vector field.
    knn_clause = query["query"]["knn"][vector_field]
    knn_clause["filter"] = lucene_filter
    return query
def _default_script_query( def _default_script_query(
query_vector: List[float], query_vector: List[float],
space_type: str = "l2", space_type: str = "l2",
@ -340,10 +355,14 @@ class OpenSearchVectorSearch(VectorStore):
size: number of results the query actually returns; default: 4 size: number of results the query actually returns; default: 4
boolean_filter: A Boolean filter consists of a Boolean query that boolean_filter: A Boolean filter consists of a Boolean query that
contains a k-NN query and a filter contains a k-NN query and a filter.
subquery_clause: Query clause on the knn vector field; default: "must" subquery_clause: Query clause on the knn vector field; default: "must"
lucene_filter: the Lucene algorithm decides whether to perform an exact
k-NN search with pre-filtering or an approximate search with modified
post-filtering.
Optional Args for Script Scoring Search: Optional Args for Script Scoring Search:
search_type: "script_scoring"; default: "approximate_search" search_type: "script_scoring"; default: "approximate_search"
@ -371,10 +390,20 @@ class OpenSearchVectorSearch(VectorStore):
size = _get_kwargs_value(kwargs, "size", 4) size = _get_kwargs_value(kwargs, "size", 4)
boolean_filter = _get_kwargs_value(kwargs, "boolean_filter", {}) boolean_filter = _get_kwargs_value(kwargs, "boolean_filter", {})
subquery_clause = _get_kwargs_value(kwargs, "subquery_clause", "must") subquery_clause = _get_kwargs_value(kwargs, "subquery_clause", "must")
lucene_filter = _get_kwargs_value(kwargs, "lucene_filter", {})
if boolean_filter != {} and lucene_filter != {}:
raise ValueError(
"Both `boolean_filter` and `lucene_filter` are provided which "
"is invalid"
)
if boolean_filter != {}: if boolean_filter != {}:
search_query = _approximate_search_query_with_boolean_filter( search_query = _approximate_search_query_with_boolean_filter(
embedding, boolean_filter, size, k, vector_field, subquery_clause embedding, boolean_filter, size, k, vector_field, subquery_clause
) )
elif lucene_filter != {}:
search_query = _approximate_search_query_with_lucene_filter(
embedding, lucene_filter, size, k, vector_field
)
else: else:
search_query = _default_approximate_search_query( search_query = _default_approximate_search_query(
embedding, size, k, vector_field embedding, size, k, vector_field
@ -442,7 +471,7 @@ class OpenSearchVectorSearch(VectorStore):
to "text". to "text".
Optional Keyword Args for Approximate Search: Optional Keyword Args for Approximate Search:
engine: "nmslib", "faiss", "hnsw"; default: "nmslib" engine: "nmslib", "faiss", "lucene"; default: "nmslib"
space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2" space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2"

View File

@ -164,3 +164,13 @@ def test_appx_search_with_boolean_filter() -> None:
"foo", k=3, boolean_filter=boolean_filter_val, subquery_clause="should" "foo", k=3, boolean_filter=boolean_filter_val, subquery_clause="should"
) )
assert output == [Document(page_content="bar")] assert output == [Document(page_content="bar")]
def test_appx_search_with_lucene_filter() -> None:
    """Approximate search on a `lucene` engine index with a Lucene filter."""
    # Term filter passed through the `lucene_filter` keyword argument.
    term_filter = {"bool": {"must": [{"term": {"text": "bar"}}]}}
    store = OpenSearchVectorSearch.from_texts(
        texts,
        FakeEmbeddings(),
        opensearch_url=DEFAULT_OPENSEARCH_URL,
        engine="lucene",
    )
    results = store.similarity_search("foo", k=3, lucene_filter=term_filter)
    assert results == [Document(page_content="bar")]