diff --git a/libs/community/langchain_community/vectorstores/faiss.py b/libs/community/langchain_community/vectorstores/faiss.py
index 48225888e59..67380614cb4 100644
--- a/libs/community/langchain_community/vectorstores/faiss.py
+++ b/libs/community/langchain_community/vectorstores/faiss.py
@@ -1346,8 +1346,11 @@ class FAISS(VectorStore):
                 conditions for documents.
 
         Returns:
-            Callable[[Dict[str, Any]], bool]: A function that takes Document's metadata
-            and returns True if it satisfies the filter conditions, otherwise False.
+            A function that takes Document's metadata and returns True if it
+            satisfies the filter conditions, otherwise False.
+
+        Raises:
+            ValueError: If the filter is invalid or contains unsupported operators.
         """
         if callable(filter):
             return filter
@@ -1357,12 +1360,118 @@ class FAISS(VectorStore):
                 f"filter must be a dict of metadata or a callable, not {type(filter)}"
             )
 
-        def filter_func(metadata: Dict[str, Any]) -> bool:
-            return all(
-                metadata.get(key) in value
-                if isinstance(value, list)
-                else metadata.get(key) == value
-                for key, value in filter.items()  # type: ignore
-            )
+        from operator import eq, ge, gt, le, lt, ne
 
-        return filter_func
+        COMPARISON_OPERATORS = {
+            "$eq": eq,
+            "$neq": ne,
+            "$gt": gt,
+            "$lt": lt,
+            "$gte": ge,
+            "$lte": le,
+        }
+        SEQUENCE_OPERATORS = {
+            "$in": lambda a, b: a in b,
+            "$nin": lambda a, b: a not in b,
+        }
+        OPERATIONS = COMPARISON_OPERATORS | SEQUENCE_OPERATORS
+        VALID_OPERATORS = frozenset(list(OPERATIONS) + ["$and", "$or", "$not"])
+        SET_CONVERT_THRESHOLD = 10
+
+        # Validate top-level filter operators.
+        for op in filter:
+            if op and op.startswith("$") and op not in VALID_OPERATORS:
+                raise ValueError(f"filter contains unsupported operator: {op}")
+
+        def filter_func_cond(
+            field: str, condition: Union[Dict[str, Any], List[Any], Any]
+        ) -> Callable[[Dict[str, Any]], bool]:
+            """
+            Creates a filter function based on field and condition.
+
+            Args:
+                field: The document field to filter on
+                condition: Filter condition (dict for operators, list for in,
+                    or direct value for equality)
+
+            Returns:
+                A filter function that takes a document and returns boolean
+            """
+            if isinstance(condition, dict):
+                operators = []
+                for op, value in condition.items():
+                    if op not in OPERATIONS:
+                        raise ValueError(f"filter contains unsupported operator: {op}")
+                    operators.append((OPERATIONS[op], value))
+
+                def filter_fn(doc: Dict[str, Any]) -> bool:
+                    """
+                    Evaluates a document against a set of predefined operators
+                    and their values. This function applies multiple
+                    comparison/sequence operators to a specific field value
+                    from the document. All conditions must be satisfied for the
+                    function to return True.
+
+                    Args:
+                        doc (Dict[str, Any]): The document to evaluate, containing
+                            key-value pairs where keys are field names and values
+                            are the field values. The document must contain the field
+                            being filtered.
+
+                    Returns:
+                        bool: True if the document's field value satisfies all
+                            operator conditions, False otherwise.
+                    """
+                    doc_value = doc.get(field)
+                    return all(op(doc_value, value) for op, value in operators)
+
+                return filter_fn
+
+            if isinstance(condition, list):
+                if len(condition) > SET_CONVERT_THRESHOLD:
+                    condition_set = frozenset(condition)
+                    return lambda doc: doc.get(field) in condition_set
+                return lambda doc: doc.get(field) in condition
+
+            return lambda doc: doc.get(field) == condition
+
+        def filter_func(filter: Dict[str, Any]) -> Callable[[Dict[str, Any]], bool]:
+            """
+            Creates a filter function that evaluates documents against specified
+            filter conditions.
+
+            This function processes a dictionary of filter conditions and returns
+            a callable that can evaluate documents against these conditions. It
+            supports logical operators ($and, $or, $not) and field-level filtering.
+
+            Args:
+                filter (Dict[str, Any]): A dictionary containing filter conditions.
+                    Can include:
+                    - Logical operators ($and, $or, $not) with lists of sub-filters
+                    - Field-level conditions with comparison or sequence operators
+                    - Direct field-value mappings for equality comparison
+
+            Returns:
+                Callable[[Dict[str, Any]], bool]: A function that takes a document
+                (as a dictionary) and returns True if the document matches all
+                filter conditions, False otherwise.
+            """
+            if "$and" in filter:
+                filters = [filter_func(sub_filter) for sub_filter in filter["$and"]]
+                return lambda doc: all(f(doc) for f in filters)
+
+            if "$or" in filter:
+                filters = [filter_func(sub_filter) for sub_filter in filter["$or"]]
+                return lambda doc: any(f(doc) for f in filters)
+
+            if "$not" in filter:
+                cond = filter_func(filter["$not"])
+                return lambda doc: not cond(doc)
+
+            conditions = [
+                filter_func_cond(field, condition)
+                for field, condition in filter.items()
+            ]
+            return lambda doc: all(condition(doc) for condition in conditions)
+
+        return filter_func(filter)
diff --git a/libs/community/tests/unit_tests/vectorstores/test_faiss.py b/libs/community/tests/unit_tests/vectorstores/test_faiss.py
index 99b4ba6e699..3fd9b5ee529 100644
--- a/libs/community/tests/unit_tests/vectorstores/test_faiss.py
+++ b/libs/community/tests/unit_tests/vectorstores/test_faiss.py
@@ -322,6 +322,808 @@ def test_faiss_mmr_with_metadatas_and_filter() -> None:
     )
 
 
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_eq() -> None:
+    texts = ["foo", "foo", "fou", "foy"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$eq": 1}}
+    )
+    assert len(output) == 1
+    assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
+    assert output[0][1] == 0.0
+    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] == 1
+    )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_neq() -> None:
+    texts = ["foo", "foo", "fou", "foy"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$neq": 1}}
+    )
+    assert len(output) == 3
+    assert output[0][0] == 
Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output[1][0] != Document(page_content="foo", metadata={"page": 0}) + assert output[2][0] != Document(page_content="foo", metadata={"page": 0}) + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] != 1 + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_gt() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": {"$gt": 0}} + ) + assert len(output) == 3 + assert output[0][0] == Document(page_content="foo", metadata={"page": 1}) + assert output[0][1] == 0.0 + assert output[1][0] != Document(page_content="foo", metadata={"page": 1}) + assert output[2][0] != Document(page_content="foo", metadata={"page": 1}) + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] > 0 + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_lt() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": {"$lt": 2}} + ) + assert len(output) == 2 + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output[1][0] == Document(page_content="foo", metadata={"page": 1}) + assert output[1][1] == 1.0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] < 2 + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_gte() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": {"$gte": 1}} + ) + assert len(output) == 3 + assert output[0][0] == Document(page_content="foo", metadata={"page": 1}) + assert output[0][1] == 0.0 + assert output[1][0] != Document(page_content="foo", metadata={"page": 1}) + assert output[2][0] != Document(page_content="foo", metadata={"page": 1}) + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] >= 1 + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_lte() -> None: + texts = ["fou", "fou", "fouu", "fouuu"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, 
lambda_mult=0.1, filter={"page": {"$lte": 0}} + ) + assert len(output) == 1 + assert output[0][0] == Document(page_content="fou", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] <= 0 + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_in_1() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": {"$in": [0]}} + ) + assert len(output) == 1 + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] in [0] + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_in_2() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": {"$in": [1, 2]}} + ) + assert len(output) == 2 + assert output[0][0] == Document(page_content="foo", metadata={"page": 1}) + assert output[0][1] == 0.0 + assert output[1][0] == Document(page_content="fou", metadata={"page": 2}) + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] in [1, 2] + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_nin_1() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": {"$nin": [0, 1]}} + ) + assert len(output) == 2 + assert output[0][0] == Document(page_content="fou", metadata={"page": 2}) + assert output[0][1] == 0.0 + assert output[1][0] == Document(page_content="foy", metadata={"page": 3}) + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] not in [0, 1] + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_nin_2() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": {"$nin": [0, 1, 2]}} + ) + assert len(output) == 1 + assert output[0][0] == Document(page_content="foy", metadata={"page": 3}) + assert output[0][1] == 0.0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, 
k=10, lambda_mult=0.1, filter=lambda di: di["page"] not in [0, 1, 2] + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_logical_operators_filter_not() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"$not": {"page": 1}} + ) + assert len(output) == 3 + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output[1][0] == Document(page_content="foy", metadata={"page": 3}) + assert output[2][0] == Document(page_content="fou", metadata={"page": 2}) + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: not di["page"] == 1 + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_1() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"$or": [{"page": 0}]} + ) + assert len(output) == 1 + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: (di["page"] == 0) + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_2() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"$or": [{"page": 0}, {"page": 1}]} + ) + assert len(output) == 2 + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output[1][0] == Document(page_content="foo", metadata={"page": 1}) + assert output[1][1] == 1.0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter=lambda di: (di["page"] == 0) or (di["page"] == 1), + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_3() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter={"$or": [{"page": 0}, {"page": 1}, {"page": 2}]}, + ) + assert len(output) == 3 + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output[1][0] != Document(page_content="foo", metadata={"page": 0}) + assert output[2][0] != Document(page_content="foo", metadata={"page": 0}) + assert output == 
docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter=lambda di: (di["page"] == 0) or (di["page"] == 1) or (di["page"] == 2), + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_1() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"$and": [{"page": 0}]} + ) + assert len(output) == 1 + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: (di["page"] == 0) + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_2() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"$and": [{"page": 0}, {"page": 1}]} + ) + assert len(output) == 0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter=lambda di: (di["page"] == 0) and (di["page"] == 1), + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_3() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter={"$and": [{"page": 0}, {"page": 1}, {"page": 2}]}, + ) + assert len(output) == 0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter=lambda di: (di["page"] == 0) and (di["page"] == 1) and (di["page"] == 2), + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_4() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter={"$and": [{"page": 0}, {"page": 0}, {"page": 0}]}, + ) + assert len(output) == 1 + assert output[0][0] == Document(page_content="foo", metadata={"page": 0}) + assert output[0][1] == 0.0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter=lambda di: (di["page"] == 0) and (di["page"] == 0) and (di["page"] == 0), + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_1() -> None: + texts = ["foo", "foo", "fou", "foy"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, 
FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter={"$and": [{"$or": [{"page": 1}, {"page": 2}]}, {"$not": {"page": 1}}]},
+    )
+    assert len(output) == 1
+    assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
+    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter=lambda di: (di["page"] == 1 or di["page"] == 2)
+        and (not di["page"] == 1),
+    )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_2() -> None:
+    texts = ["foo", "foo", "fou", "foy"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter={
+            "$and": [
+                {"$or": [{"page": 1}, {"page": 2}]},
+                {"$or": [{"page": 3}, {"page": 2}, {"page": 0}]},
+            ]
+        },
+    )
+    assert len(output) == 1
+    assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
+    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter=lambda di: (di["page"] == 1 or di["page"] == 2)
+        and (di["page"] == 3 or di["page"] == 2 or di["page"] == 0),
+    )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_3() -> None:
+    texts = ["foo", "foo", "fou", "foy"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter={
+            "$or": [
+                {"$and": [{"page": 1}, {"page": 2}]},
+                {"$and": [{"page": 0}, {"page": 2}]},
+            ]
+        },
+    )
+    assert len(output) == 0
+    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter=lambda di: (di["page"] == 1 and di["page"] == 2)
+        or (di["page"] == 0 and di["page"] == 2),
+    )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_logical_comparison_operators_filter_1() -> None:
+    texts = ["foo", "foo", "fou", "foy"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter={"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
+    )
+    assert len(output) == 2
+    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
+    assert output[0][1] == 0.0
+    assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
+    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter=lambda di: di["page"] < 1 or di["page"] > 2,
+    )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_logical_comparison_operators_filter_2() -> None:
+    texts = ["foo", "foo", "fou", "foy"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec, k=10, lambda_mult=0.1, filter={"$not": {"page": {"$lt": 1}}}
+    )
+    assert len(output) == 3
+    assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
+    assert output[0][1] == 0.0
+    assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
+    assert output[2][0] == Document(page_content="fou", metadata={"page": 2})
+    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec, k=10, lambda_mult=0.1, filter=lambda di: not di["page"] < 1
+    )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_nested_logical_comparison_ops_filter_1() -> None:
+    texts = ["foo", "foo", "fou", "foy"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter={
+            "$and": [
+                {"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
+                {"$or": [{"page": {"$eq": 0}}, {"page": {"$eq": 1}}]},
+            ]
+        },
+    )
+    assert len(output) == 1
+    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
+    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter=lambda di: (di["page"] < 1 or di["page"] > 2)
+        and (di["page"] == 0 or di["page"] == 1),
+    )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_nested_logical_comparison_ops_filter_2() -> None:
+    texts = ["foo", "foo", "fou", "foy"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter={
+            "$and": [
+                {"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
+                {"$not": {"page": {"$in": [0]}}},
+                {"page": {"$neq": 3}},
+            ]
+        },
+    )
+    assert len(output) == 0
+    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter=lambda di: (di["page"] < 1 or di["page"] > 2)
+        and (di["page"] not in [0])
+        and (di["page"] != 3),
+    )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_nested_logical_comparison_ops_filter_3() -> None:
+    texts = ["foo", "foo", "fou", "foy"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter={
+            "$or": [
+                {"$and": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
+                {"$not": {"page": {"$nin": [0]}}},
+                {"page": {"$eq": 3}},
+            ]
+        },
+    )
+    assert len(output) == 2
+    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
+    assert output[0][1] == 0.0
+    assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
+    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec,
+        k=10,
+        lambda_mult=0.1,
+        filter=lambda di: (di["page"] < 1 and di["page"] > 2)
+        or (not di["page"] not in [0])
+        or (di["page"] == 3),
+    )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_empty_conditions() -> None:
+    """Test with an empty filter condition."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec, k=10, lambda_mult=0.1, filter={}
+    )
+
+    assert len(output) == 3
+    assert all(doc[0].page_content in ["foo", "bar", "baz"] for doc in output)
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_empty_and_operator() -> None:
+    """Test with an empty $and operator."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+
+    # Using an empty $and filter
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec, k=10, lambda_mult=0.1, filter={"$and": []}
+    )
+
+    assert len(output) == 3
+    assert all(doc[0].page_content in ["foo", "bar", "baz"] for doc in output)
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_empty_or_operator() -> None:
+    """Test with an empty $or operator."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+
+    # Using an empty $or filter
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec, k=10, lambda_mult=0.1, filter={"$or": []}
+    )
+
+    assert len(output) == 0
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_nonexistent_field() -> None:
+    """Test with a non-existent field in the metadata."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+
+    # Using a filter with a non-existent field
+    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
+        query_vec, k=10, lambda_mult=0.1, filter={"nonexistent_field": {"$eq": 1}}
+    )
+
+    assert len(output) == 0  # Expecting no documents to match
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_invalid_logical_operator() -> None:
+    """Test with an invalid logical operator key."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+
+    # Using a filter with an invalid logical operator
+    with pytest.raises(ValueError, match="unsupported operator"):
+        docsearch.max_marginal_relevance_search_with_score_by_vector(
+            query_vec, k=10, lambda_mult=0.1, filter={"$unknown": [{"page": 1}]}
+        )
+
+
+@pytest.mark.requires("faiss")
+def test_faiss_mmr_with_metadatas_and_invalid_comparison_operator() -> None:
+    """Test with an invalid comparison operator key."""
+    texts = ["foo", "bar", "baz"]
+    metadatas = [{"page": i} for i in range(len(texts))]
+    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
+    query_vec = FakeEmbeddings().embed_query(text="foo")
+
+    # Using a filter with an invalid comparison operator
+    with 
pytest.raises(ValueError, match="unsupported operator"): + docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": {"$invalid_operator": 1}} + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_valid_invalid_fields() -> None: + """Test with logical operators combining valid and invalid field.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + + # Using a filter with $and combining valid and invalid fields + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter={ + "$and": [ + {"page": {"$eq": 1}}, # Valid field + {"invalid_field": {"$eq": 1}}, # Invalid field + ] + }, + ) + # Expecting no documents to match due to the invalid field + assert len(output) == 0 + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_valid_and_invalid_operators() -> None: + """Test with logical operators combining valid and invalid operators.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + + # Using a filter with $and combining valid and invalid operators + with pytest.raises(ValueError, match="unsupported operator"): + docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter={ + "$and": [ + {"page": {"$eq": 1}}, # Valid condition + {"page": {"$unknown": 2}}, # Invalid operator + ] + }, + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_multiple_nested_logical_operators() -> None: + """Test with multiple nested logical operators.""" + texts = ["foo", "bar", "baz", "qux", "quux"] + metadatas = [ + {"page": 1, "chapter": 1, "section": 3}, + {"page": 2, "chapter": 2, "section": 4}, + {"page": 1, "chapter": 3, "section": 6}, + {"page": 3, "chapter": 2, "section": 5}, + {"page": 4, "chapter": 1, "section": 2}, + ] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + + # Using a filter with multiple nested logical operators + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter={ + "$and": [ + {"$or": [{"page": {"$eq": 1}}, {"chapter": {"$gt": 2}}]}, + {"$not": {"section": {"$lte": 5}}}, + ] + }, + ) + assert len(output) > 0 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter=lambda di: ( + (di["page"] == 1 or di["chapter"] > 2) and di["section"] > 5 + ), + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_mixed_data_types() -> None: + """Test with metadata containing mixed data types (numbers, strings, booleans).""" + texts = ["foo", "bar", "baz", "qux", "quux"] + metadatas: list[dict] = [ + {"page": "1", "isActive": True, "priority": 2.5}, + {"page": 2, "isActive": False, "priority": 3.0}, + {"page": 3, "isActive": True, "priority": 1.5}, + {"page": 1, "isActive": True, "priority": 4.0}, + {"page": 4, "isActive": False, "priority": 2.0}, + ] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + + # 
Using a filter with mixed data types + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter={ + "page": {"$eq": "1"}, # String comparison + "isActive": {"$eq": True}, # Boolean comparison + "priority": {"$gt": 2.0}, # Numeric comparison + }, + ) + # Assert output matches expected results based on the filter conditions + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter=lambda di: ( + di["page"] == "1" and di["isActive"] is True and di["priority"] > 2.0 + ), + ) + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_conflicting_conditions() -> None: + """Test with conflicting conditions in filters.""" + texts = ["foo", "bar", "baz"] + metadatas = [{"page": i} for i in range(len(texts))] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, + k=10, + lambda_mult=0.1, + filter={"$and": [{"page": {"$eq": 1}}, {"page": {"$eq": 2}}]}, + ) + # Assert that the output is empty due to conflicting conditions + assert len(output) == 0 + + +@pytest.mark.requires("faiss") +def test_faiss_mmr_with_metadatas_and_null_field_values() -> None: + """Test with fields that have null or undefined values.""" + texts = ["foo", "bar", "baz", "qux"] + metadatas: list[dict] = [{"page": 1}, {"page": None}, {"page": 2}, {"page": None}] + docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas) + query_vec = FakeEmbeddings().embed_query(text="foo") + # Using a filter to find documents where page is null + output = docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter={"page": {"$eq": None}} + ) + assert len(output) == 2 + assert output == docsearch.max_marginal_relevance_search_with_score_by_vector( + query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] is None + ) + + @pytest.mark.requires("faiss") async def test_faiss_async_mmr_with_metadatas_and_filter() -> None: texts = ["foo", "foo", "fou", "foy"]
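Illustrative usage sketch (not part of the patch above): it shows how the operator-style filters added to _create_filter_func can be passed through the public FAISS search methods. The snippet assumes faiss-cpu and langchain_community are installed; FakeEmbeddings (the fake embedding helper from langchain_community.embeddings) and the "page" metadata field are placeholders chosen only to keep the example self-contained.

from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import FAISS

# Build a small store; any Embeddings implementation works the same way.
store = FAISS.from_texts(
    ["alpha", "beta", "gamma"],
    FakeEmbeddings(size=16),
    metadatas=[{"page": 0}, {"page": 1}, {"page": 2}],
)

# Field-level comparison operator: keep only documents whose page is > 0.
docs = store.similarity_search("alpha", k=3, filter={"page": {"$gt": 0}})

# Logical operators compose nested sub-filters: page >= 1 and page != 2.
docs = store.similarity_search(
    "alpha",
    k=3,
    filter={"$and": [{"page": {"$gte": 1}}, {"$not": {"page": {"$eq": 2}}}]},
)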