From bb6c459f7a53745ede3a0a05329ebc21b401b550 Mon Sep 17 00:00:00 2001 From: Naveen Tatikonda Date: Thu, 20 Apr 2023 22:42:53 -0500 Subject: [PATCH] OpenSearch: Add Support for Lucene Filter (#3201) ### Description Add Support for Lucene Filter. When you specify a Lucene filter for a k-NN search, the Lucene algorithm decides whether to perform an exact k-NN search with pre-filtering or an approximate search with modified post-filtering. This filter is supported only for approximate search with the indexes that are created using `lucene` engine. OpenSearch Documentation - https://opensearch.org/docs/latest/search-plugins/knn/filter-search-knn/#lucene-k-nn-filter-implementation Signed-off-by: Naveen Tatikonda --- .../vectorstores/opensearch_vector_search.py | 33 +++++++++++++++++-- .../vectorstores/test_opensearch.py | 10 ++++++ 2 files changed, 41 insertions(+), 2 deletions(-) diff --git a/langchain/vectorstores/opensearch_vector_search.py b/langchain/vectorstores/opensearch_vector_search.py index d039497175b..b44857e276d 100644 --- a/langchain/vectorstores/opensearch_vector_search.py +++ b/langchain/vectorstores/opensearch_vector_search.py @@ -168,6 +168,21 @@ def _approximate_search_query_with_boolean_filter( } +def _approximate_search_query_with_lucene_filter( + query_vector: List[float], + lucene_filter: Dict, + size: int = 4, + k: int = 4, + vector_field: str = "vector_field", +) -> Dict: + """For Approximate k-NN Search, with Lucene Filter.""" + search_query = _default_approximate_search_query( + query_vector, size, k, vector_field + ) + search_query["query"]["knn"][vector_field]["filter"] = lucene_filter + return search_query + + def _default_script_query( query_vector: List[float], space_type: str = "l2", @@ -340,10 +355,14 @@ class OpenSearchVectorSearch(VectorStore): size: number of results the query actually returns; default: 4 boolean_filter: A Boolean filter consists of a Boolean query that - contains a k-NN query and a filter + contains a k-NN query and a filter. subquery_clause: Query clause on the knn vector field; default: "must" + lucene_filter: the Lucene algorithm decides whether to perform an exact + k-NN search with pre-filtering or an approximate search with modified + post-filtering. + Optional Args for Script Scoring Search: search_type: "script_scoring"; default: "approximate_search" @@ -371,10 +390,20 @@ class OpenSearchVectorSearch(VectorStore): size = _get_kwargs_value(kwargs, "size", 4) boolean_filter = _get_kwargs_value(kwargs, "boolean_filter", {}) subquery_clause = _get_kwargs_value(kwargs, "subquery_clause", "must") + lucene_filter = _get_kwargs_value(kwargs, "lucene_filter", {}) + if boolean_filter != {} and lucene_filter != {}: + raise ValueError( + "Both `boolean_filter` and `lucene_filter` are provided which " + "is invalid" + ) if boolean_filter != {}: search_query = _approximate_search_query_with_boolean_filter( embedding, boolean_filter, size, k, vector_field, subquery_clause ) + elif lucene_filter != {}: + search_query = _approximate_search_query_with_lucene_filter( + embedding, lucene_filter, size, k, vector_field + ) else: search_query = _default_approximate_search_query( embedding, size, k, vector_field @@ -442,7 +471,7 @@ class OpenSearchVectorSearch(VectorStore): to "text". Optional Keyword Args for Approximate Search: - engine: "nmslib", "faiss", "hnsw"; default: "nmslib" + engine: "nmslib", "faiss", "lucene"; default: "nmslib" space_type: "l2", "l1", "cosinesimil", "linf", "innerproduct"; default: "l2" diff --git a/tests/integration_tests/vectorstores/test_opensearch.py b/tests/integration_tests/vectorstores/test_opensearch.py index 92bfca4bc1d..36fb320864e 100644 --- a/tests/integration_tests/vectorstores/test_opensearch.py +++ b/tests/integration_tests/vectorstores/test_opensearch.py @@ -164,3 +164,13 @@ def test_appx_search_with_boolean_filter() -> None: "foo", k=3, boolean_filter=boolean_filter_val, subquery_clause="should" ) assert output == [Document(page_content="bar")] + + +def test_appx_search_with_lucene_filter() -> None: + """Test Approximate Search with Lucene Filter.""" + lucene_filter_val = {"bool": {"must": [{"term": {"text": "bar"}}]}} + docsearch = OpenSearchVectorSearch.from_texts( + texts, FakeEmbeddings(), opensearch_url=DEFAULT_OPENSEARCH_URL, engine="lucene" + ) + output = docsearch.similarity_search("foo", k=3, lucene_filter=lucene_filter_val) + assert output == [Document(page_content="bar")]