mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-12 06:13:36 +00:00
community[minor]: FAISS Filter Function Enhancement with Advanced Query Operators (#28207)
## Description We are submitting as a team of four for a project. Other team members are @RuofanChen03, @LikeWang10067, @TANYAL77. This pull requests expands the filtering capabilities of the FAISS vectorstore by adding MongoDB-style query operators indicated as follows, while including comprehensive testing for the added functionality. - $eq (equals) - $neq (not equals) - $gt (greater than) - $lt (less than) - $gte (greater than or equal) - $lte (less than or equal) - $in (membership in list) - $nin (not in list) - $and (all conditions must match) - $or (any condition must match) - $not (negation of condition) ## Issue This closes https://github.com/langchain-ai/langchain/issues/26379. ## Sample Usage ```python import faiss import asyncio from langchain_community.vectorstores import FAISS from langchain.schema import Document from langchain_huggingface import HuggingFaceEmbeddings embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") documents = [ Document(page_content="Process customer refund request", metadata={"schema_type": "financial", "handler_type": "refund",}), Document(page_content="Update customer shipping address", metadata={"schema_type": "customer", "handler_type": "update",}), Document(page_content="Process payment transaction", metadata={"schema_type": "financial", "handler_type": "payment",}), Document(page_content="Handle customer complaint", metadata={"schema_type": "customer","handler_type": "complaint",}), Document(page_content="Process invoice payment", metadata={"schema_type": "financial","handler_type": "payment",}) ] async def search(vectorstore, query, schema_type, handler_type, k=2): schema_filter = {"schema_type": {"$eq": schema_type}} handler_filter = {"handler_type": {"$eq": handler_type}} combined_filter = { "$and": [ schema_filter, handler_filter, ] } base_retriever = vectorstore.as_retriever( search_kwargs={"k":k, "filter":combined_filter} ) return await base_retriever.ainvoke(query) async def 
main(): vectorstore = FAISS.from_texts( texts=[doc.page_content for doc in documents], embedding=embeddings, metadatas=[doc.metadata for doc in documents] ) def printt(title, documents): print(title) if not documents: print("\tNo documents found.") return for doc in documents: print(f"\t{doc.page_content}. {doc.metadata}") printt("Documents:", documents) printt('\nquery="process payment", schema_type="financial", handler_type="payment":', await search(vectorstore, query="process payment", schema_type="financial", handler_type="payment", k=2)) printt('\nquery="customer update", schema_type="customer", handler_type="update":', await search(vectorstore, query="customer update", schema_type="customer", handler_type="update", k=2)) printt('\nquery="refund process", schema_type="financial", handler_type="refund":', await search(vectorstore, query="refund process", schema_type="financial", handler_type="refund", k=2)) printt('\nquery="refund process", schema_type="financial", handler_type="foobar":', await search(vectorstore, query="refund process", schema_type="financial", handler_type="foobar", k=2)) print() if __name__ == "__main__":asyncio.run(main()) ``` ## Output ``` Documents: Process customer refund request. {'schema_type': 'financial', 'handler_type': 'refund'} Update customer shipping address. {'schema_type': 'customer', 'handler_type': 'update'} Process payment transaction. {'schema_type': 'financial', 'handler_type': 'payment'} Handle customer complaint. {'schema_type': 'customer', 'handler_type': 'complaint'} Process invoice payment. {'schema_type': 'financial', 'handler_type': 'payment'} query="process payment", schema_type="financial", handler_type="payment": Process payment transaction. {'schema_type': 'financial', 'handler_type': 'payment'} Process invoice payment. {'schema_type': 'financial', 'handler_type': 'payment'} query="customer update", schema_type="customer", handler_type="update": Update customer shipping address. 
{'schema_type': 'customer', 'handler_type': 'update'} query="refund process", schema_type="financial", handler_type="refund": Process customer refund request. {'schema_type': 'financial', 'handler_type': 'refund'} query="refund process", schema_type="financial", handler_type="foobar": No documents found. ``` --------- Co-authored-by: ruofan chen <ruofan.is.awesome@gmail.com> Co-authored-by: RickyCowboy <like.wang@mail.utoronto.ca> Co-authored-by: Shanni Li <tanya.li@mail.utoronto.ca> Co-authored-by: RuofanChen03 <114096642+ruofanchen03@users.noreply.github.com> Co-authored-by: Like Wang <102838708+likewang10067@users.noreply.github.com>
This commit is contained in:
parent
b9dd4f2985
commit
df5008fe55
@ -1346,8 +1346,11 @@ class FAISS(VectorStore):
|
||||
conditions for documents.
|
||||
|
||||
Returns:
|
||||
Callable[[Dict[str, Any]], bool]: A function that takes Document's metadata
|
||||
and returns True if it satisfies the filter conditions, otherwise False.
|
||||
A function that takes Document's metadata and returns True if it
|
||||
satisfies the filter conditions, otherwise False.
|
||||
|
||||
Raises:
|
||||
ValueError: If the filter is invalid or contains unsupported operators.
|
||||
"""
|
||||
if callable(filter):
|
||||
return filter
|
||||
@ -1357,12 +1360,118 @@ class FAISS(VectorStore):
|
||||
f"filter must be a dict of metadata or a callable, not {type(filter)}"
|
||||
)
|
||||
|
||||
def filter_func(metadata: Dict[str, Any]) -> bool:
|
||||
return all(
|
||||
metadata.get(key) in value
|
||||
if isinstance(value, list)
|
||||
else metadata.get(key) == value
|
||||
for key, value in filter.items() # type: ignore
|
||||
)
|
||||
# Map the MongoDB-style comparison operators onto the stdlib ``operator``
# helpers; sequence operators need small lambdas since there is no stdlib
# equivalent for (not-)membership with swapped operands.
from operator import eq, ge, gt, le, lt, ne

COMPARISON_OPERATORS = {
    "$eq": eq,
    "$neq": ne,
    "$gt": gt,
    "$lt": lt,
    "$gte": ge,
    "$lte": le,
}
SEQUENCE_OPERATORS = {
    "$in": lambda a, b: a in b,
    "$nin": lambda a, b: a not in b,
}
OPERATIONS = COMPARISON_OPERATORS | SEQUENCE_OPERATORS
# Logical operators are handled structurally, not via OPERATIONS.
VALID_OPERATORS = frozenset(OPERATIONS) | {"$and", "$or", "$not"}
# Membership lists longer than this are converted to a frozenset for
# O(1) lookups.
SET_CONVERT_THRESHOLD = 10

# Reject unknown "$"-prefixed operators at the top level of the filter
# before building any predicates.
for op in filter:
    if op and op.startswith("$") and op not in VALID_OPERATORS:
        raise ValueError(f"filter contains unsupported operator: {op}")
|
||||
|
||||
def filter_func_cond(
    field: str, condition: Union[Dict[str, Any], List[Any], Any]
) -> Callable[[Dict[str, Any]], bool]:
    """Build a predicate for a single metadata field.

    Args:
        field: The metadata key the predicate inspects.
        condition: Either an operator mapping (e.g. ``{"$gt": 3}``), a list
            (shorthand for ``$in`` membership), or a bare value (equality).

    Returns:
        A function that takes a metadata dict and returns True when the
        field satisfies ``condition``, otherwise False.

    Raises:
        ValueError: If ``condition`` contains an unsupported operator.
    """
    if isinstance(condition, dict):
        # Validate every operator up front, then pre-resolve the callables
        # so the returned predicate does no dict lookups per document.
        unknown = [op for op in condition if op not in OPERATIONS]
        if unknown:
            raise ValueError(f"filter contains unsupported operator: {unknown[0]}")
        checks = [(OPERATIONS[op], expected) for op, expected in condition.items()]

        def filter_fn(doc: Dict[str, Any]) -> bool:
            """Return True when the field's value passes every operator check."""
            actual = doc.get(field)
            return all(compare(actual, expected) for compare, expected in checks)

        return filter_fn

    if isinstance(condition, list):
        # Bare list is shorthand for $in; large lists get a frozenset so
        # each membership test is O(1) instead of O(len(condition)).
        if len(condition) > SET_CONVERT_THRESHOLD:
            members = frozenset(condition)
            return lambda doc: doc.get(field) in members
        return lambda doc: doc.get(field) in condition

    # Bare value: plain equality comparison.
    return lambda doc: doc.get(field) == condition
|
||||
|
||||
def filter_func(filter: Dict[str, Any]) -> Callable[[Dict[str, Any]], bool]:
    """Recursively compile a filter dict into a metadata predicate.

    Handles the logical operators ``$and``, ``$or`` and ``$not`` (each
    wrapping sub-filters, compiled recursively) and delegates field-level
    conditions to ``filter_func_cond``. A filter with several plain fields
    matches only documents satisfying all of them.

    Args:
        filter: Filter specification. May contain logical operators with
            lists of sub-filters, field-level operator conditions, or
            direct field-value mappings for equality comparison.

    Returns:
        A function that takes a document's metadata dict and returns True
        when it matches all filter conditions, otherwise False.
    """
    if "$and" in filter:
        sub_preds = [filter_func(sub) for sub in filter["$and"]]
        return lambda doc: all(pred(doc) for pred in sub_preds)

    if "$or" in filter:
        sub_preds = [filter_func(sub) for sub in filter["$or"]]
        return lambda doc: any(pred(doc) for pred in sub_preds)

    if "$not" in filter:
        negated = filter_func(filter["$not"])
        return lambda doc: not negated(doc)

    # No logical operator: every field-level condition must hold.
    field_preds = [
        filter_func_cond(field, condition) for field, condition in filter.items()
    ]
    return lambda doc: all(pred(doc) for pred in field_preds)
|
||||
|
||||
return filter_func(filter)
|
||||
|
@ -322,6 +322,808 @@ def test_faiss_mmr_with_metadatas_and_filter() -> None:
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_eq() -> None:
    # $eq must behave exactly like a callable equality filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$eq": 1}}
    )
    assert len(output) == 1
    assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
    assert output[0][1] == 0.0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] == 1
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_neq() -> None:
    # $neq must behave exactly like a callable inequality filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$neq": 1}}
    )
    assert len(output) == 3
    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
    assert output[2][0] != Document(page_content="foo", metadata={"page": 0})
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] != 1
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_gt() -> None:
    # $gt must behave exactly like a callable ">" filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$gt": 0}}
    )
    assert len(output) == 3
    assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
    assert output[0][1] == 0.0
    assert output[1][0] != Document(page_content="foo", metadata={"page": 1})
    assert output[2][0] != Document(page_content="foo", metadata={"page": 1})
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] > 0
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_lt() -> None:
    # $lt must behave exactly like a callable "<" filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$lt": 2}}
    )
    assert len(output) == 2
    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output[1][0] == Document(page_content="foo", metadata={"page": 1})
    assert output[1][1] == 1.0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] < 2
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_gte() -> None:
    # $gte must behave exactly like a callable ">=" filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$gte": 1}}
    )
    assert len(output) == 3
    assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
    assert output[0][1] == 0.0
    assert output[1][0] != Document(page_content="foo", metadata={"page": 1})
    assert output[2][0] != Document(page_content="foo", metadata={"page": 1})
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] >= 1
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_lte() -> None:
    # $lte must behave exactly like a callable "<=" filter.
    texts = ["fou", "fou", "fouu", "fouuu"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$lte": 0}}
    )
    assert len(output) == 1
    assert output[0][0] == Document(page_content="fou", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] <= 0
    )
|
||||
|
||||
|
||||
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_in_1() -> None:
    # $in with a single-element list must match a callable "in" filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$in": [0]}}
    )
    assert len(output) == 1
    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] in [0]
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_in_2() -> None:
    # $in with several candidates must match a callable "in" filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$in": [1, 2]}}
    )
    assert len(output) == 2
    assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
    assert output[0][1] == 0.0
    assert output[1][0] == Document(page_content="fou", metadata={"page": 2})
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] in [1, 2]
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_nin_1() -> None:
    # $nin must match a callable "not in" filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$nin": [0, 1]}}
    )
    assert len(output) == 2
    assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
    assert output[0][1] == 0.0
    assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] not in [0, 1]
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_nin_2() -> None:
    # $nin excluding all but one page must leave a single match.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"page": {"$nin": [0, 1, 2]}}
    )
    assert len(output) == 1
    assert output[0][0] == Document(page_content="foy", metadata={"page": 3})
    assert output[0][1] == 0.0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] not in [0, 1, 2]
    )
|
||||
|
||||
|
||||
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_not() -> None:
    # $not must invert an equality condition.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"$not": {"page": 1}}
    )
    assert len(output) == 3
    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
    assert output[2][0] == Document(page_content="fou", metadata={"page": 2})
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: not di["page"] == 1
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_1() -> None:
    # $or with a single sub-filter degenerates to that sub-filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"$or": [{"page": 0}]}
    )
    assert len(output) == 1
    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: (di["page"] == 0)
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_2() -> None:
    # $or over two sub-filters matches documents satisfying either.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"$or": [{"page": 0}, {"page": 1}]}
    )
    assert len(output) == 2
    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output[1][0] == Document(page_content="foo", metadata={"page": 1})
    assert output[1][1] == 1.0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (di["page"] == 0) or (di["page"] == 1),
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_3() -> None:
    # $or over three sub-filters matches documents satisfying any.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter={"$or": [{"page": 0}, {"page": 1}, {"page": 2}]},
    )
    assert len(output) == 3
    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
    assert output[2][0] != Document(page_content="foo", metadata={"page": 0})
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (di["page"] == 0) or (di["page"] == 1) or (di["page"] == 2),
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_1() -> None:
    # $and with a single sub-filter degenerates to that sub-filter.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"$and": [{"page": 0}]}
    )
    assert len(output) == 1
    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter=lambda di: (di["page"] == 0)
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_2() -> None:
    # Contradictory $and sub-filters must match nothing.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec, k=10, lambda_mult=0.1, filter={"$and": [{"page": 0}, {"page": 1}]}
    )
    assert len(output) == 0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (di["page"] == 0) and (di["page"] == 1),
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_3() -> None:
    # Three contradictory $and sub-filters must match nothing.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter={"$and": [{"page": 0}, {"page": 1}, {"page": 2}]},
    )
    assert len(output) == 0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (di["page"] == 0) and (di["page"] == 1) and (di["page"] == 2),
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_4() -> None:
    # Repeated identical $and sub-filters behave like a single condition.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter={"$and": [{"page": 0}, {"page": 0}, {"page": 0}]},
    )
    assert len(output) == 1
    assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert output[0][1] == 0.0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (di["page"] == 0) and (di["page"] == 0) and (di["page"] == 0),
    )
|
||||
|
||||
|
||||
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_1() -> None:
    # $and combining an $or branch with a $not branch.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter={"$and": [{"$or": [{"page": 1}, {"page": 2}]}, {"$not": {"page": 1}}]},
    )
    assert len(output) == 1
    assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (di["page"] == 1 or di["page"] == 2)
        and (not di["page"] == 1),
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_2() -> None:
    # $and over two $or branches: only the intersection survives.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter={
            "$and": [
                {"$or": [{"page": 1}, {"page": 2}]},
                {"$or": [{"page": 3}, {"page": 2}, {"page": 0}]},
            ]
        },
    )
    assert len(output) == 1
    assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (di["page"] == 1 or di["page"] == 2)
        and (di["page"] == 3 or di["page"] == 2 or di["page"] == 0),
    )


@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_3() -> None:
    # $or over two contradictory $and branches must match nothing.
    texts = ["foo", "foo", "fou", "foy"]
    metadatas = [{"page": idx} for idx in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
    query_vec = FakeEmbeddings().embed_query(text="foo")
    output = docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter={
            "$or": [
                {"$and": [{"page": 1}, {"page": 2}]},
                {"$and": [{"page": 0}, {"page": 2}]},
            ]
        },
    )
    assert len(output) == 0
    assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
        query_vec,
        k=10,
        lambda_mult=0.1,
        filter=lambda di: (di["page"] == 1 and di["page"] == 2)
        or (di["page"] == 0 and di["page"] == 2),
    )
|
||||
|
||||
|
||||
@pytest.mark.requires("faiss")
|
||||
def test_faiss_mmr_with_metadatas_and_logical_comparsion_operators_filter_1() -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
metadatas = [{"page": i} for i in range(len(texts))]
|
||||
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||
query_vec,
|
||||
k=10,
|
||||
lambda_mult=0.1,
|
||||
filter={"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
|
||||
)
|
||||
assert len(output) == 2
|
||||
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||
assert output[0][1] == 0.0
|
||||
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
|
||||
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||
query_vec,
|
||||
k=10,
|
||||
lambda_mult=0.1,
|
||||
filter=lambda di: di["page"] < 1 or di["page"] > 2,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.requires("faiss")
|
||||
def test_faiss_mmr_with_metadatas_and_logical_comparsion_operators_filter_2() -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
metadatas = [{"page": i} for i in range(len(texts))]
|
||||
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||
query_vec, k=10, lambda_mult=0.1, filter={"$not": {"page": {"$lt": 1}}}
|
||||
)
|
||||
assert len(output) == 3
|
||||
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
|
||||
assert output[0][1] == 0.0
|
||||
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
|
||||
assert output[2][0] == Document(page_content="fou", metadata={"page": 2})
|
||||
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||
query_vec, k=10, lambda_mult=0.1, filter=lambda di: not di["page"] < 1
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_comparsion_ops_filter_1() -> None:
    """MMR search with ``$and`` over two nested ``$or`` comparison groups."""
    corpus = ["foo", "foo", "fou", "foy"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter={
            "$and": [
                {"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
                {"$or": [{"page": {"$eq": 0}}, {"page": {"$eq": 1}}]},
            ]
        },
    )
    # Only page 0 satisfies both nested $or groups.
    assert len(results) == 1
    assert results[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert results == store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter=lambda md: (md["page"] < 1 or md["page"] > 2)
        and (md["page"] == 0 or md["page"] == 1),
    )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_comparsion_ops_filter_2() -> None:
    """MMR search combining ``$or``, ``$not``/``$in``, and ``$neq`` under ``$and``.

    The conditions are mutually exclusive, so no document can match.
    """
    corpus = ["foo", "foo", "fou", "foy"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter={
            "$and": [
                {"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
                {"$not": {"page": {"$in": [0]}}},
                {"page": {"$neq": 3}},
            ]
        },
    )
    assert len(results) == 0
    # Callable form of the same (unsatisfiable) predicate must also be empty.
    assert results == store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter=lambda md: (md["page"] < 1 or md["page"] > 2)
        and (md["page"] not in [0])
        and (md["page"] != 3),
    )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nested_logical_comparsion_ops_filter_3() -> None:
    """MMR search with ``$or`` over ``$and``, ``$not``/``$nin``, and ``$eq``."""
    corpus = ["foo", "foo", "fou", "foy"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter={
            "$or": [
                {"$and": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
                {"$not": {"page": {"$nin": [0]}}},
                {"page": {"$eq": 3}},
            ]
        },
    )
    # Pages 0 (via $not/$nin double negation) and 3 (via $eq) match.
    assert len(results) == 2
    assert results[0][0] == Document(page_content="foo", metadata={"page": 0})
    assert results[0][1] == 0.0
    assert results[1][0] == Document(page_content="foy", metadata={"page": 3})
    assert results == store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        # "not (page not in [0])" simplifies to "page in [0]".
        filter=lambda md: (md["page"] < 1 and md["page"] > 2)
        or (md["page"] in [0])
        or (md["page"] == 3),
    )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_empty_conditions() -> None:
    """An empty dict filter must match every document."""
    corpus = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query, k=10, lambda_mult=0.1, filter={}
    )

    assert len(results) == 3
    assert all(pair[0].page_content in ["foo", "bar", "baz"] for pair in results)
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadates_and_empty_and_operator() -> None:
    """An empty ``$and`` (vacuous truth) must match every document."""
    corpus = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # $and with no clauses: all() over an empty list is True for every doc.
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query, k=10, lambda_mult=0.1, filter={"$and": []}
    )

    assert len(results) == 3
    assert all(pair[0].page_content in ["foo", "bar", "baz"] for pair in results)
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_empty_or_operator() -> None:
    """An empty ``$or`` (vacuous falsehood) must match no documents."""
    corpus = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # $or with no clauses: any() over an empty list is False for every doc.
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query, k=10, lambda_mult=0.1, filter={"$or": []}
    )

    assert len(results) == 0
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_nonexistent_field() -> None:
    """Filtering on a field absent from all metadata must match nothing."""
    corpus = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # No document carries "nonexistent_field", so the $eq can never hold.
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query, k=10, lambda_mult=0.1, filter={"nonexistent_field": {"$eq": 1}}
    )

    assert len(results) == 0  # Expecting no documents to match
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_invalid_logical_operator() -> None:
    """An unknown logical operator key must raise ``ValueError``."""
    corpus = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # "$unknown" is not a supported logical operator.
    with pytest.raises(ValueError, match="unsupported operator"):
        store.max_marginal_relevance_search_with_score_by_vector(
            embedded_query, k=10, lambda_mult=0.1, filter={"$unknown": [{"page": 1}]}
        )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_invalid_comparison_operator() -> None:
    """An unknown comparison operator key must raise ``ValueError``."""
    corpus = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # "$invalid_operator" is not a supported comparison operator.
    with pytest.raises(ValueError, match="unsupported operator"):
        store.max_marginal_relevance_search_with_score_by_vector(
            embedded_query, k=10, lambda_mult=0.1, filter={"page": {"$invalid_operator": 1}}
        )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_valid_invalid_fields() -> None:
    """``$and`` mixing an existing field with a missing one matches nothing."""
    corpus = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # The missing "invalid_field" clause can never hold, so the $and fails.
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter={
            "$and": [
                {"page": {"$eq": 1}},  # Valid field
                {"invalid_field": {"$eq": 1}},  # Invalid field
            ]
        },
    )
    # Expecting no documents to match due to the invalid field
    assert len(results) == 0
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_valid_and_invalid_operators() -> None:
    """``$and`` mixing a valid and an invalid operator must raise ``ValueError``."""
    corpus = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # One bad operator anywhere in the tree should fail the whole filter.
    with pytest.raises(ValueError, match="unsupported operator"):
        store.max_marginal_relevance_search_with_score_by_vector(
            embedded_query,
            k=10,
            lambda_mult=0.1,
            filter={
                "$and": [
                    {"page": {"$eq": 1}},  # Valid condition
                    {"page": {"$unknown": 2}},  # Invalid operator
                ]
            },
        )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_multiple_nested_logical_operators() -> None:
    """Deeply nested ``$and``/``$or``/``$not`` over several metadata fields."""
    corpus = ["foo", "bar", "baz", "qux", "quux"]
    metas = [
        {"page": 1, "chapter": 1, "section": 3},
        {"page": 2, "chapter": 2, "section": 4},
        {"page": 1, "chapter": 3, "section": 6},
        {"page": 3, "chapter": 2, "section": 5},
        {"page": 4, "chapter": 1, "section": 2},
    ]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # (page == 1 OR chapter > 2) AND NOT(section <= 5)
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter={
            "$and": [
                {"$or": [{"page": {"$eq": 1}}, {"chapter": {"$gt": 2}}]},
                {"$not": {"section": {"$lte": 5}}},
            ]
        },
    )
    assert len(results) > 0
    # The equivalent callable predicate must select the same documents.
    assert results == store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter=lambda md: (
            (md["page"] == 1 or md["chapter"] > 2) and md["section"] > 5
        ),
    )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_mixed_data_types() -> None:
    """Test with metadata containing mixed data types (numbers, strings, booleans)."""
    corpus = ["foo", "bar", "baz", "qux", "quux"]
    metas: list[dict] = [
        {"page": "1", "isActive": True, "priority": 2.5},
        {"page": 2, "isActive": False, "priority": 3.0},
        {"page": 3, "isActive": True, "priority": 1.5},
        {"page": 1, "isActive": True, "priority": 4.0},
        {"page": 4, "isActive": False, "priority": 2.0},
    ]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # Implicit top-level AND across string, boolean, and numeric comparisons.
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter={
            "page": {"$eq": "1"},  # String comparison
            "isActive": {"$eq": True},  # Boolean comparison
            "priority": {"$gt": 2.0},  # Numeric comparison
        },
    )
    # Assert output matches expected results based on the filter conditions
    assert results == store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter=lambda md: (
            md["page"] == "1" and md["isActive"] is True and md["priority"] > 2.0
        ),
    )
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_conflicting_conditions() -> None:
    """``$and`` of two mutually exclusive ``$eq`` clauses matches nothing."""
    corpus = ["foo", "bar", "baz"]
    metas = [{"page": idx} for idx in range(len(corpus))]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")

    # page cannot equal both 1 and 2 at once.
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query,
        k=10,
        lambda_mult=0.1,
        filter={"$and": [{"page": {"$eq": 1}}, {"page": {"$eq": 2}}]},
    )
    # Assert that the output is empty due to conflicting conditions
    assert len(results) == 0
@pytest.mark.requires("faiss")
def test_faiss_mmr_with_metadatas_and_null_field_values() -> None:
    """``$eq: None`` must select exactly the documents with a null field."""
    corpus = ["foo", "bar", "baz", "qux"]
    metas: list[dict] = [{"page": 1}, {"page": None}, {"page": 2}, {"page": None}]
    store = FAISS.from_texts(corpus, FakeEmbeddings(), metadatas=metas)
    embedded_query = FakeEmbeddings().embed_query(text="foo")
    # Two documents carry page=None and should be the only matches.
    results = store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query, k=10, lambda_mult=0.1, filter={"page": {"$eq": None}}
    )
    assert len(results) == 2
    # Callable identity check against None must agree with the dict filter.
    assert results == store.max_marginal_relevance_search_with_score_by_vector(
        embedded_query, k=10, lambda_mult=0.1, filter=lambda md: md["page"] is None
    )
@pytest.mark.requires("faiss")
|
||||
async def test_faiss_async_mmr_with_metadatas_and_filter() -> None:
|
||||
texts = ["foo", "foo", "fou", "foy"]
|
||||
|
Loading…
Reference in New Issue
Block a user