community[minor]: FAISS Filter Function Enhancement with Advanced Query Operators (#28207)

## Description
We are submitting as a team of four for a project. Other team members
are @RuofanChen03, @LikeWang10067, @TANYAL77.

This pull request expands the filtering capabilities of the FAISS
vectorstore by adding the following MongoDB-style query operators,
along with comprehensive tests for the added
functionality.
- $eq (equals)
- $neq (not equals)
- $gt (greater than)
- $lt (less than)
- $gte (greater than or equal)
- $lte (less than or equal)
- $in (membership in list)
- $nin (not in list)
- $and (all conditions must match)
- $or (any condition must match)
- $not (negation of condition)


## Issue
This closes https://github.com/langchain-ai/langchain/issues/26379.


## Sample Usage
```python
import faiss
import asyncio
from langchain_community.vectorstores import FAISS
from langchain.schema import Document
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
documents = [
    Document(page_content="Process customer refund request", metadata={"schema_type": "financial", "handler_type": "refund",}),
    Document(page_content="Update customer shipping address", metadata={"schema_type": "customer", "handler_type": "update",}),
    Document(page_content="Process payment transaction", metadata={"schema_type": "financial", "handler_type": "payment",}),
    Document(page_content="Handle customer complaint", metadata={"schema_type": "customer","handler_type": "complaint",}),
    Document(page_content="Process invoice payment", metadata={"schema_type": "financial","handler_type": "payment",})
]

async def search(vectorstore, query, schema_type, handler_type, k=2):
    """Retrieve up to ``k`` documents matching ``query`` whose metadata has
    both the given ``schema_type`` and ``handler_type`` ($and of two $eq
    conditions)."""
    metadata_filter = {
        "$and": [
            {"schema_type": {"$eq": schema_type}},
            {"handler_type": {"$eq": handler_type}},
        ]
    }
    retriever = vectorstore.as_retriever(
        search_kwargs={"k": k, "filter": metadata_filter}
    )
    return await retriever.ainvoke(query)

async def main():
    """Index the sample documents in FAISS and demo several filtered searches."""
    vectorstore = FAISS.from_texts(
        texts=[doc.page_content for doc in documents],
        embedding=embeddings,
        metadatas=[doc.metadata for doc in documents],
    )

    def printt(title, docs):
        """Print *title*, then one indented line per document (or a placeholder)."""
        print(title)
        if not docs:
            print("\tNo documents found.")
            return
        for doc in docs:
            print(f"\t{doc.page_content}. {doc.metadata}")

    printt("Documents:", documents)
    printt('\nquery="process payment", schema_type="financial", handler_type="payment":', await search(vectorstore, query="process payment", schema_type="financial", handler_type="payment", k=2))
    printt('\nquery="customer update", schema_type="customer", handler_type="update":', await search(vectorstore, query="customer update", schema_type="customer", handler_type="update", k=2))
    printt('\nquery="refund process", schema_type="financial", handler_type="refund":', await search(vectorstore, query="refund process", schema_type="financial", handler_type="refund", k=2))
    printt('\nquery="refund process", schema_type="financial", handler_type="foobar":', await search(vectorstore, query="refund process", schema_type="financial", handler_type="foobar", k=2))
    print()

if __name__ == "__main__":
    asyncio.run(main())
```

## Output
```
Documents:
	Process customer refund request. {'schema_type': 'financial', 'handler_type': 'refund'}
	Update customer shipping address. {'schema_type': 'customer', 'handler_type': 'update'}
	Process payment transaction. {'schema_type': 'financial', 'handler_type': 'payment'}
	Handle customer complaint. {'schema_type': 'customer', 'handler_type': 'complaint'}
	Process invoice payment. {'schema_type': 'financial', 'handler_type': 'payment'}

query="process payment", schema_type="financial", handler_type="payment":
	Process payment transaction. {'schema_type': 'financial', 'handler_type': 'payment'}
	Process invoice payment. {'schema_type': 'financial', 'handler_type': 'payment'}

query="customer update", schema_type="customer", handler_type="update":
	Update customer shipping address. {'schema_type': 'customer', 'handler_type': 'update'}

query="refund process", schema_type="financial", handler_type="refund":
	Process customer refund request. {'schema_type': 'financial', 'handler_type': 'refund'}

query="refund process", schema_type="financial", handler_type="foobar":
	No documents found.

```

---------

Co-authored-by: ruofan chen <ruofan.is.awesome@gmail.com>
Co-authored-by: RickyCowboy <like.wang@mail.utoronto.ca>
Co-authored-by: Shanni Li <tanya.li@mail.utoronto.ca>
Co-authored-by: RuofanChen03 <114096642+ruofanchen03@users.noreply.github.com>
Co-authored-by: Like Wang <102838708+likewang10067@users.noreply.github.com>
This commit is contained in:
Vincent Zhang 2024-12-11 17:52:22 -05:00 committed by GitHub
parent b9dd4f2985
commit df5008fe55
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 921 additions and 10 deletions

View File

@ -1346,8 +1346,11 @@ class FAISS(VectorStore):
conditions for documents.
Returns:
Callable[[Dict[str, Any]], bool]: A function that takes Document's metadata
and returns True if it satisfies the filter conditions, otherwise False.
A function that takes Document's metadata and returns True if it
satisfies the filter conditions, otherwise False.
Raises:
ValueError: If the filter is invalid or contains unsupported operators.
"""
if callable(filter):
return filter
@ -1357,12 +1360,118 @@ class FAISS(VectorStore):
f"filter must be a dict of metadata or a callable, not {type(filter)}"
)
def filter_func(metadata: Dict[str, Any]) -> bool:
return all(
metadata.get(key) in value
if isinstance(value, list)
else metadata.get(key) == value
for key, value in filter.items() # type: ignore
)
from operator import eq, ge, gt, le, lt, ne
return filter_func
COMPARISON_OPERATORS = {
"$eq": eq,
"$neq": ne,
"$gt": gt,
"$lt": lt,
"$gte": ge,
"$lte": le,
}
SEQUENCE_OPERATORS = {
"$in": lambda a, b: a in b,
"$nin": lambda a, b: a not in b,
}
OPERATIONS = COMPARISON_OPERATORS | SEQUENCE_OPERATORS
VALID_OPERATORS = frozenset(list(OPERATIONS) + ["$and", "$or", "$not"])
SET_CONVERT_THRESHOLD = 10
# Validate top-level filter operators.
for op in filter:
if op and op.startswith("$") and op not in VALID_OPERATORS:
raise ValueError(f"filter contains unsupported operator: {op}")
def filter_func_cond(
    field: str, condition: Union[Dict[str, Any], List[Any], Any]
) -> Callable[[Dict[str, Any]], bool]:
    """Build a metadata predicate for a single field.

    Args:
        field: Metadata key the predicate inspects.
        condition: Either an operator dict (e.g. ``{"$gt": 3}``), a list
            (shorthand for membership), or a plain value (shorthand for
            equality).

    Returns:
        A callable mapping a metadata dict to True/False.

    Raises:
        ValueError: If the operator dict contains an unknown operator.
    """
    if isinstance(condition, dict):
        # Pre-resolve every operator once so the returned predicate only
        # applies (op, operand) pairs.
        checks = []
        for op, value in condition.items():
            if op not in OPERATIONS:
                raise ValueError(f"filter contains unsupported operator: {op}")
            checks.append((OPERATIONS[op], value))

        def match_all(doc: Dict[str, Any]) -> bool:
            """Return True iff the field's value satisfies every operator."""
            actual = doc.get(field)
            return all(op(actual, value) for op, value in checks)

        return match_all

    if isinstance(condition, list):
        # For long lists, convert to a frozenset for O(1) membership tests.
        if len(condition) > SET_CONVERT_THRESHOLD:
            members = frozenset(condition)
            return lambda doc: doc.get(field) in members
        return lambda doc: doc.get(field) in condition

    # Bare value: simple equality check.
    return lambda doc: doc.get(field) == condition
def filter_func(filter: Dict[str, Any]) -> Callable[[Dict[str, Any]], bool]:
    """Recursively compile a filter dict into a metadata predicate.

    Handles the logical operators ``$and`` / ``$or`` / ``$not`` (checked in
    that order) by compiling their sub-filters first; any remaining keys are
    treated as field-level conditions, all of which must hold.

    Args:
        filter: Filter specification — logical operators over sub-filters,
            field-level operator dicts, or direct field/value equality pairs.

    Returns:
        A callable mapping a metadata dict to True/False.
    """
    if "$and" in filter:
        branches = [filter_func(sub) for sub in filter["$and"]]
        return lambda doc: all(branch(doc) for branch in branches)
    if "$or" in filter:
        branches = [filter_func(sub) for sub in filter["$or"]]
        return lambda doc: any(branch(doc) for branch in branches)
    if "$not" in filter:
        negated = filter_func(filter["$not"])
        return lambda doc: not negated(doc)
    # No logical operator: every field condition must match.
    predicates = [
        filter_func_cond(field, condition)
        for field, condition in filter.items()
    ]
    return lambda doc: all(predicate(doc) for predicate in predicates)

View File

@ -322,6 +322,808 @@ def test_faiss_mmr_with_metadatas_and_filter() -> None:
)
# $eq: only the page == 1 document survives; result equals the callable-filter form.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_eq() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$eq": 1}}
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
assert output[0][1] == 0.0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] == 1
)
# $neq: page == 1 is excluded, the other three documents remain.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_neq() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$neq": 1}}
)
assert len(output) == 3
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
assert output[2][0] != Document(page_content="foo", metadata={"page": 0})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] != 1
)
# $gt: keeps pages 1-3.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_gt() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$gt": 0}}
)
assert len(output) == 3
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
assert output[0][1] == 0.0
assert output[1][0] != Document(page_content="foo", metadata={"page": 1})
assert output[2][0] != Document(page_content="foo", metadata={"page": 1})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] > 0
)
# $lt: keeps pages 0 and 1.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_lt() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$lt": 2}}
)
assert len(output) == 2
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output[1][0] == Document(page_content="foo", metadata={"page": 1})
assert output[1][1] == 1.0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] < 2
)
# $gte: keeps pages 1-3 (boundary included).
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_gte() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$gte": 1}}
)
assert len(output) == 3
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
assert output[0][1] == 0.0
assert output[1][0] != Document(page_content="foo", metadata={"page": 1})
assert output[2][0] != Document(page_content="foo", metadata={"page": 1})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] >= 1
)
# $lte: different corpus; only page 0 satisfies page <= 0.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_lte() -> None:
texts = ["fou", "fou", "fouu", "fouuu"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$lte": 0}}
)
assert len(output) == 1
assert output[0][0] == Document(page_content="fou", metadata={"page": 0})
assert output[0][1] == 0.0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] <= 0
)
# $in with a single-element list: only page 0 matches.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_in_1() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$in": [0]}}
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] in [0]
)
# $in with two values: pages 1 and 2 match.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_in_2() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$in": [1, 2]}}
)
assert len(output) == 2
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
assert output[0][1] == 0.0
assert output[1][0] == Document(page_content="fou", metadata={"page": 2})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] in [1, 2]
)
# $nin excluding pages 0 and 1: pages 2 and 3 remain.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_nin_1() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$nin": [0, 1]}}
)
assert len(output) == 2
assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
assert output[0][1] == 0.0
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] not in [0, 1]
)
# $nin excluding pages 0-2: only page 3 remains.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_nin_2() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$nin": [0, 1, 2]}}
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foy", metadata={"page": 3})
assert output[0][1] == 0.0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] not in [0, 1, 2]
)
# $not over an equality sub-filter: everything except page 1.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_not() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"$not": {"page": 1}}
)
assert len(output) == 3
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
assert output[2][0] == Document(page_content="fou", metadata={"page": 2})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: not di["page"] == 1
)
# $or with a single branch behaves like that branch alone.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_1() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"$or": [{"page": 0}]}
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: (di["page"] == 0)
)
# $or with two branches: pages 0 and 1 both match.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_2() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"$or": [{"page": 0}, {"page": 1}]}
)
assert len(output) == 2
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output[1][0] == Document(page_content="foo", metadata={"page": 1})
assert output[1][1] == 1.0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] == 0) or (di["page"] == 1),
)
# $or with three branches: pages 0-2 match.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_3() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={"$or": [{"page": 0}, {"page": 1}, {"page": 2}]},
)
assert len(output) == 3
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
assert output[2][0] != Document(page_content="foo", metadata={"page": 0})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] == 0) or (di["page"] == 1) or (di["page"] == 2),
)
# $and with a single branch behaves like that branch alone.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_1() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"$and": [{"page": 0}]}
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: (di["page"] == 0)
)
# $and of contradictory conditions matches nothing.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_2() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"$and": [{"page": 0}, {"page": 1}]}
)
assert len(output) == 0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] == 0) and (di["page"] == 1),
)
# Three contradictory $and branches also match nothing.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_3() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={"$and": [{"page": 0}, {"page": 1}, {"page": 2}]},
)
assert len(output) == 0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] == 0) and (di["page"] == 1) and (di["page"] == 2),
)
# Redundant identical $and branches collapse to a single condition.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_4() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={"$and": [{"page": 0}, {"page": 0}, {"page": 0}]},
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] == 0) and (di["page"] == 0) and (di["page"] == 0),
)
# $and of ($or pages 1/2) with ($not page 1): only page 2 survives.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_1() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={"$and": [{"$or": [{"page": 1}, {"page": 2}]}, {"$not": {"page": 1}}]},
)
assert len(output) == 1
assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] == 1 or di["page"] == 2)
and (not di["page"] == 1),
)
# Intersection of two $or groups: only page 2 is in both.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_2() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={
"$and": [
{"$or": [{"page": 1}, {"page": 2}]},
{"$or": [{"page": 3}, {"page": 2}, {"page": 0}]},
]
},
)
assert len(output) == 1
assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] == 1 or di["page"] == 2)
and (di["page"] == 3 or di["page"] == 2 or di["page"] == 0),
)
# $or of two contradictory $and groups: nothing matches.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_3() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={
"$or": [
{"$and": [{"page": 1}, {"page": 2}]},
{"$and": [{"page": 0}, {"page": 2}]},
]
},
)
assert len(output) == 0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] == 1 and di["page"] == 2)
or (di["page"] == 0 and di["page"] == 2),
)
# $or of comparison conditions: page < 1 or page > 2 keeps pages 0 and 3.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_comparsion_operators_filter_1() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
)
assert len(output) == 2
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: di["page"] < 1 or di["page"] > 2,
)
# $not over a comparison: not(page < 1) keeps pages 1-3.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_comparsion_operators_filter_2() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={"$not": {"page": {"$lt": 1}}}
)
assert len(output) == 3
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
assert output[0][1] == 0.0
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
assert output[2][0] == Document(page_content="fou", metadata={"page": 2})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter=lambda di: not di["page"] < 1
)
# $and of two $or groups mixing $lt/$gt and $eq: only page 0 satisfies both.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_comparsion_ops_filter_1() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={
"$and": [
{"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
{"$or": [{"page": {"$eq": 0}}, {"page": {"$eq": 1}}]},
]
},
)
assert len(output) == 1
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] < 1 or di["page"] > 2)
and (di["page"] == 0 or di["page"] == 1),
)
# $and combining $or, $not($in) and $neq eliminates every candidate.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_comparsion_ops_filter_2() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={
"$and": [
{"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
{"$not": {"page": {"$in": [0]}}},
{"page": {"$neq": 3}},
]
},
)
assert len(output) == 0
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] < 1 or di["page"] > 2)
and (di["page"] not in [0])
and (di["page"] != 3),
)
# $or combining $and, $not($nin) and $eq keeps pages 0 and 3.
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_comparsion_ops_filter_3() -> None:
texts = ["foo", "foo", "fou", "foy"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter={
"$or": [
{"$and": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
{"$not": {"page": {"$nin": [0]}}},
{"page": {"$eq": 3}},
]
},
)
assert len(output) == 2
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
assert output[0][1] == 0.0
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec,
k=10,
lambda_mult=0.1,
filter=lambda di: (di["page"] < 1 and di["page"] > 2)
or (not di["page"] not in [0])
or (di["page"] == 3),
)
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_empty_conditions() -> None:
"""Test with an empty filter condition."""
texts = ["foo", "bar", "baz"]
metadatas = [{"page": i} for i in range(len(texts))]
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
query_vec = FakeEmbeddings().embed_query(text="foo")
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
query_vec, k=10, lambda_mult=0.1, filter={}
)
assert len(output) == 3
assert all(doc[0].page_content in ["foo", "bar", "baz"] for doc in output)
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadates_and_empty_and_operator() -> None:
    """An $and with no conditions is vacuously true, so all documents match."""
    contents = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(contents))]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded, k=10, lambda_mult=0.1, filter={"$and": []}
    )
    assert len(results) == 3
    assert all(doc.page_content in ["foo", "bar", "baz"] for doc, _ in results)
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_empty_or_operator() -> None:
    """An $or with no conditions is vacuously false, so nothing matches."""
    contents = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(contents))]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded, k=10, lambda_mult=0.1, filter={"$or": []}
    )
    assert len(results) == 0
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nonexistent_field() -> None:
    """Filtering on a field absent from every metadata dict yields no results."""
    contents = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(contents))]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded, k=10, lambda_mult=0.1, filter={"nonexistent_field": {"$eq": 1}}
    )
    # No document carries the field, so nothing can satisfy the condition.
    assert len(results) == 0
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_invalid_logical_operator() -> None:
    """An unrecognized top-level operator such as $unknown raises ValueError."""
    contents = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(contents))]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    bad_filter = {"$unknown": [{"page": 1}]}
    with pytest.raises(ValueError, match="unsupported operator"):
        store.max_marginal_relevance_search_with_score_by_vector(
            embedded, k=10, lambda_mult=0.1, filter=bad_filter
        )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_invalid_comparison_operator() -> None:
    """An unrecognized field-level comparison operator raises ValueError."""
    contents = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(contents))]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    bad_filter = {"page": {"$invalid_operator": 1}}
    with pytest.raises(ValueError, match="unsupported operator"):
        store.max_marginal_relevance_search_with_score_by_vector(
            embedded, k=10, lambda_mult=0.1, filter=bad_filter
        )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_valid_invalid_fields() -> None:
    """Test with logical operators combining valid and invalid field."""
    contents = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(contents))]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    # $and over one satisfiable condition and one on a field no document has.
    mixed_filter = {
        "$and": [
            {"page": {"$eq": 1}},  # Valid field
            {"invalid_field": {"$eq": 1}},  # Invalid field
        ]
    }
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded, k=10, lambda_mult=0.1, filter=mixed_filter
    )
    # The missing field can never satisfy its condition, so $and excludes all.
    assert len(results) == 0
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_valid_and_invalid_operators() -> None:
    """Test with logical operators combining valid and invalid operators."""
    contents = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(contents))]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    # One valid condition plus one unknown operator: the whole filter is rejected.
    mixed_filter = {
        "$and": [
            {"page": {"$eq": 1}},  # Valid condition
            {"page": {"$unknown": 2}},  # Invalid operator
        ]
    }
    with pytest.raises(ValueError, match="unsupported operator"):
        store.max_marginal_relevance_search_with_score_by_vector(
            embedded, k=10, lambda_mult=0.1, filter=mixed_filter
        )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_multiple_nested_logical_operators() -> None:
    """Test with multiple nested logical operators."""
    contents = ["foo", "bar", "baz", "qux", "quux"]
    metas = [
        {"page": 1, "chapter": 1, "section": 3},
        {"page": 2, "chapter": 2, "section": 4},
        {"page": 1, "chapter": 3, "section": 6},
        {"page": 3, "chapter": 2, "section": 5},
        {"page": 4, "chapter": 1, "section": 2},
    ]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    # ($or: page == 1 OR chapter > 2) AND NOT(section <= 5)
    nested_filter = {
        "$and": [
            {"$or": [{"page": {"$eq": 1}}, {"chapter": {"$gt": 2}}]},
            {"$not": {"section": {"$lte": 5}}},
        ]
    }
    output = store.max_marginal_relevance_search_with_score_by_vector(
        embedded, k=10, lambda_mult=0.1, filter=nested_filter
    )
    assert len(output) > 0
    # The callable filter expresses the same predicate in plain Python.
    equivalent = store.max_marginal_relevance_search_with_score_by_vector(
        embedded,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (
            (di["page"] == 1 or di["chapter"] > 2) and di["section"] > 5
        ),
    )
    assert output == equivalent
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_mixed_data_types() -> None:
    """Test with metadata containing mixed data types (numbers, strings, booleans)."""
    texts = ["foo", "bar", "baz", "qux", "quux"]
    metadatas: list[dict] = [
        {"page": "1", "isActive": True, "priority": 2.5},
        {"page": 2, "isActive": False, "priority": 3.0},
        {"page": 3, "isActive": True, "priority": 1.5},
        {"page": 1, "isActive": True, "priority": 4.0},
        {"page": 4, "isActive": False, "priority": 2.0},
    ]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    # Using a filter with mixed data types
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter={
            "page": {"$eq": "1"},  # String comparison
            "isActive": {"$eq": True},  # Boolean comparison
            "priority": {"$gt": 2.0},  # Numeric comparison
        },
    )
    # Only the first document satisfies all three conditions (string "1",
    # True, 2.5 > 2.0); without this check the test would pass vacuously
    # if both filters matched nothing.
    assert len(output) == 1
    # Assert output matches expected results based on the filter conditions
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (
            di["page"] == "1" and di["isActive"] is True and di["priority"] > 2.0
        ),
    )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_conflicting_conditions() -> None:
    """Test with conflicting conditions in filters."""
    contents = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(contents))]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    # page cannot equal both 1 and 2, so the conjunction is unsatisfiable.
    conflicting = {"$and": [{"page": {"$eq": 1}}, {"page": {"$eq": 2}}]}
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded, k=10, lambda_mult=0.1, filter=conflicting
    )
    assert len(results) == 0
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_null_field_values() -> None:
    """Test with fields that have null or undefined values."""
    contents = ["foo", "bar", "baz", "qux"]
    metas: list[dict] = [{"page": 1}, {"page": None}, {"page": 2}, {"page": None}]
    store = FAISS.from_texts(contents, FakeEmbeddings(), metadatas=metas)
    embedded = FakeEmbeddings().embed_query(text="foo")
    # $eq against None should select exactly the documents whose page is null.
    null_matches = store.max_marginal_relevance_search_with_score_by_vector(
        embedded, k=10, lambda_mult=0.1, filter={"page": {"$eq": None}}
    )
    assert len(null_matches) == 2
    # The callable form of the same predicate must agree with the dict form.
    assert null_matches == store.max_marginal_relevance_search_with_score_by_vector(
        embedded, k=10, lambda_mult=0.1, filter=lambda di: di["page"] is None
    )
@pytest.mark.requires("faiss")
async def test_faiss_async_mmr_with_metadatas_and_filter() -> None:
texts = ["foo", "foo", "fou", "foy"]