mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-13 14:50:00 +00:00
community[minor]: FAISS Filter Function Enhancement with Advanced Query Operators (#28207)
## Description We are submitting as a team of four for a project. Other team members are @RuofanChen03, @LikeWang10067, @TANYAL77. This pull requests expands the filtering capabilities of the FAISS vectorstore by adding MongoDB-style query operators indicated as follows, while including comprehensive testing for the added functionality. - $eq (equals) - $neq (not equals) - $gt (greater than) - $lt (less than) - $gte (greater than or equal) - $lte (less than or equal) - $in (membership in list) - $nin (not in list) - $and (all conditions must match) - $or (any condition must match) - $not (negation of condition) ## Issue This closes https://github.com/langchain-ai/langchain/issues/26379. ## Sample Usage ```python import faiss import asyncio from langchain_community.vectorstores import FAISS from langchain.schema import Document from langchain_huggingface import HuggingFaceEmbeddings embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2") documents = [ Document(page_content="Process customer refund request", metadata={"schema_type": "financial", "handler_type": "refund",}), Document(page_content="Update customer shipping address", metadata={"schema_type": "customer", "handler_type": "update",}), Document(page_content="Process payment transaction", metadata={"schema_type": "financial", "handler_type": "payment",}), Document(page_content="Handle customer complaint", metadata={"schema_type": "customer","handler_type": "complaint",}), Document(page_content="Process invoice payment", metadata={"schema_type": "financial","handler_type": "payment",}) ] async def search(vectorstore, query, schema_type, handler_type, k=2): schema_filter = {"schema_type": {"$eq": schema_type}} handler_filter = {"handler_type": {"$eq": handler_type}} combined_filter = { "$and": [ schema_filter, handler_filter, ] } base_retriever = vectorstore.as_retriever( search_kwargs={"k":k, "filter":combined_filter} ) return await base_retriever.ainvoke(query) async def 
main(): vectorstore = FAISS.from_texts( texts=[doc.page_content for doc in documents], embedding=embeddings, metadatas=[doc.metadata for doc in documents] ) def printt(title, documents): print(title) if not documents: print("\tNo documents found.") return for doc in documents: print(f"\t{doc.page_content}. {doc.metadata}") printt("Documents:", documents) printt('\nquery="process payment", schema_type="financial", handler_type="payment":', await search(vectorstore, query="process payment", schema_type="financial", handler_type="payment", k=2)) printt('\nquery="customer update", schema_type="customer", handler_type="update":', await search(vectorstore, query="customer update", schema_type="customer", handler_type="update", k=2)) printt('\nquery="refund process", schema_type="financial", handler_type="refund":', await search(vectorstore, query="refund process", schema_type="financial", handler_type="refund", k=2)) printt('\nquery="refund process", schema_type="financial", handler_type="foobar":', await search(vectorstore, query="refund process", schema_type="financial", handler_type="foobar", k=2)) print() if __name__ == "__main__":asyncio.run(main()) ``` ## Output ``` Documents: Process customer refund request. {'schema_type': 'financial', 'handler_type': 'refund'} Update customer shipping address. {'schema_type': 'customer', 'handler_type': 'update'} Process payment transaction. {'schema_type': 'financial', 'handler_type': 'payment'} Handle customer complaint. {'schema_type': 'customer', 'handler_type': 'complaint'} Process invoice payment. {'schema_type': 'financial', 'handler_type': 'payment'} query="process payment", schema_type="financial", handler_type="payment": Process payment transaction. {'schema_type': 'financial', 'handler_type': 'payment'} Process invoice payment. {'schema_type': 'financial', 'handler_type': 'payment'} query="customer update", schema_type="customer", handler_type="update": Update customer shipping address. 
{'schema_type': 'customer', 'handler_type': 'update'} query="refund process", schema_type="financial", handler_type="refund": Process customer refund request. {'schema_type': 'financial', 'handler_type': 'refund'} query="refund process", schema_type="financial", handler_type="foobar": No documents found. ``` --------- Co-authored-by: ruofan chen <ruofan.is.awesome@gmail.com> Co-authored-by: RickyCowboy <like.wang@mail.utoronto.ca> Co-authored-by: Shanni Li <tanya.li@mail.utoronto.ca> Co-authored-by: RuofanChen03 <114096642+ruofanchen03@users.noreply.github.com> Co-authored-by: Like Wang <102838708+likewang10067@users.noreply.github.com>
This commit is contained in:
parent
b9dd4f2985
commit
df5008fe55
@ -1346,8 +1346,11 @@ class FAISS(VectorStore):
|
|||||||
conditions for documents.
|
conditions for documents.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Callable[[Dict[str, Any]], bool]: A function that takes Document's metadata
|
A function that takes Document's metadata and returns True if it
|
||||||
and returns True if it satisfies the filter conditions, otherwise False.
|
satisfies the filter conditions, otherwise False.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
ValueError: If the filter is invalid or contains unsupported operators.
|
||||||
"""
|
"""
|
||||||
if callable(filter):
|
if callable(filter):
|
||||||
return filter
|
return filter
|
||||||
@ -1357,12 +1360,118 @@ class FAISS(VectorStore):
|
|||||||
f"filter must be a dict of metadata or a callable, not {type(filter)}"
|
f"filter must be a dict of metadata or a callable, not {type(filter)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
def filter_func(metadata: Dict[str, Any]) -> bool:
|
from operator import eq, ge, gt, le, lt, ne
|
||||||
return all(
|
|
||||||
metadata.get(key) in value
|
|
||||||
if isinstance(value, list)
|
|
||||||
else metadata.get(key) == value
|
|
||||||
for key, value in filter.items() # type: ignore
|
|
||||||
)
|
|
||||||
|
|
||||||
return filter_func
|
COMPARISON_OPERATORS = {
|
||||||
|
"$eq": eq,
|
||||||
|
"$neq": ne,
|
||||||
|
"$gt": gt,
|
||||||
|
"$lt": lt,
|
||||||
|
"$gte": ge,
|
||||||
|
"$lte": le,
|
||||||
|
}
|
||||||
|
SEQUENCE_OPERATORS = {
|
||||||
|
"$in": lambda a, b: a in b,
|
||||||
|
"$nin": lambda a, b: a not in b,
|
||||||
|
}
|
||||||
|
OPERATIONS = COMPARISON_OPERATORS | SEQUENCE_OPERATORS
|
||||||
|
VALID_OPERATORS = frozenset(list(OPERATIONS) + ["$and", "$or", "$not"])
|
||||||
|
SET_CONVERT_THRESHOLD = 10
|
||||||
|
|
||||||
|
# Validate top-level filter operators.
|
||||||
|
for op in filter:
|
||||||
|
if op and op.startswith("$") and op not in VALID_OPERATORS:
|
||||||
|
raise ValueError(f"filter contains unsupported operator: {op}")
|
||||||
|
|
||||||
|
def filter_func_cond(
|
||||||
|
field: str, condition: Union[Dict[str, Any], List[Any], Any]
|
||||||
|
) -> Callable[[Dict[str, Any]], bool]:
|
||||||
|
"""
|
||||||
|
Creates a filter function based on field and condition.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
field: The document field to filter on
|
||||||
|
condition: Filter condition (dict for operators, list for in,
|
||||||
|
or direct value for equality)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A filter function that takes a document and returns boolean
|
||||||
|
"""
|
||||||
|
if isinstance(condition, dict):
|
||||||
|
operators = []
|
||||||
|
for op, value in condition.items():
|
||||||
|
if op not in OPERATIONS:
|
||||||
|
raise ValueError(f"filter contains unsupported operator: {op}")
|
||||||
|
operators.append((OPERATIONS[op], value))
|
||||||
|
|
||||||
|
def filter_fn(doc: Dict[str, Any]) -> bool:
|
||||||
|
"""
|
||||||
|
Evaluates a document against a set of predefined operators
|
||||||
|
and their values. This function applies multiple
|
||||||
|
comparison/sequence operators to a specific field value
|
||||||
|
from the document. All conditions must be satisfied for the
|
||||||
|
function to return True.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
doc (Dict[str, Any]): The document to evaluate, containing
|
||||||
|
key-value pairs where keys are field names and values
|
||||||
|
are the field values. The document must contain the field
|
||||||
|
being filtered.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
bool: True if the document's field value satisfies all
|
||||||
|
operator conditions, False otherwise.
|
||||||
|
"""
|
||||||
|
doc_value = doc.get(field)
|
||||||
|
return all(op(doc_value, value) for op, value in operators)
|
||||||
|
|
||||||
|
return filter_fn
|
||||||
|
|
||||||
|
if isinstance(condition, list):
|
||||||
|
if len(condition) > SET_CONVERT_THRESHOLD:
|
||||||
|
condition_set = frozenset(condition)
|
||||||
|
return lambda doc: doc.get(field) in condition_set
|
||||||
|
return lambda doc: doc.get(field) in condition
|
||||||
|
|
||||||
|
return lambda doc: doc.get(field) == condition
|
||||||
|
|
||||||
|
def filter_func(filter: Dict[str, Any]) -> Callable[[Dict[str, Any]], bool]:
|
||||||
|
"""
|
||||||
|
Creates a filter function that evaluates documents against specified
|
||||||
|
filter conditions.
|
||||||
|
|
||||||
|
This function processes a dictionary of filter conditions and returns
|
||||||
|
a callable that can evaluate documents against these conditions. It
|
||||||
|
supports logical operators ($and, $or, $not) and field-level filtering.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
filter (Dict[str, Any]): A dictionary containing filter conditions.
|
||||||
|
Can include:
|
||||||
|
- Logical operators ($and, $or, $not) with lists of sub-filters
|
||||||
|
- Field-level conditions with comparison or sequence operators
|
||||||
|
- Direct field-value mappings for equality comparison
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Callable[[Dict[str, Any]], bool]: A function that takes a document
|
||||||
|
(as a dictionary) and returns True if the document matches all
|
||||||
|
filter conditions, False otherwise.
|
||||||
|
"""
|
||||||
|
if "$and" in filter:
|
||||||
|
filters = [filter_func(sub_filter) for sub_filter in filter["$and"]]
|
||||||
|
return lambda doc: all(f(doc) for f in filters)
|
||||||
|
|
||||||
|
if "$or" in filter:
|
||||||
|
filters = [filter_func(sub_filter) for sub_filter in filter["$or"]]
|
||||||
|
return lambda doc: any(f(doc) for f in filters)
|
||||||
|
|
||||||
|
if "$not" in filter:
|
||||||
|
cond = filter_func(filter["$not"])
|
||||||
|
return lambda doc: not cond(doc)
|
||||||
|
|
||||||
|
conditions = [
|
||||||
|
filter_func_cond(field, condition)
|
||||||
|
for field, condition in filter.items()
|
||||||
|
]
|
||||||
|
return lambda doc: all(condition(doc) for condition in conditions)
|
||||||
|
|
||||||
|
return filter_func(filter)
|
||||||
|
@ -322,6 +322,808 @@ def test_faiss_mmr_with_metadatas_and_filter() -> None:
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_eq() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$eq": 1}}
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] == 1
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_neq() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$neq": 1}}
|
||||||
|
)
|
||||||
|
assert len(output) == 3
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[2][0] != Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] != 1
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_gt() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$gt": 0}}
|
||||||
|
)
|
||||||
|
assert len(output) == 3
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] != Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output[2][0] != Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] > 0
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_lt() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$lt": 2}}
|
||||||
|
)
|
||||||
|
assert len(output) == 2
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] == Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output[1][1] == 1.0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] < 2
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_gte() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$gte": 1}}
|
||||||
|
)
|
||||||
|
assert len(output) == 3
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] != Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output[2][0] != Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] >= 1
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_lte() -> None:
|
||||||
|
texts = ["fou", "fou", "fouu", "fouuu"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$lte": 0}}
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="fou", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] <= 0
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_in_1() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$in": [0]}}
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] in [0]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_in_2() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$in": [1, 2]}}
|
||||||
|
)
|
||||||
|
assert len(output) == 2
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] == Document(page_content="fou", metadata={"page": 2})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] in [1, 2]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_nin_1() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$nin": [0, 1]}}
|
||||||
|
)
|
||||||
|
assert len(output) == 2
|
||||||
|
assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] not in [0, 1]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_comparison_operators_filter_nin_2() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$nin": [0, 1, 2]}}
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="foy", metadata={"page": 3})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] not in [0, 1, 2]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_not() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"$not": {"page": 1}}
|
||||||
|
)
|
||||||
|
assert len(output) == 3
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
|
||||||
|
assert output[2][0] == Document(page_content="fou", metadata={"page": 2})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: not di["page"] == 1
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_1() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"$or": [{"page": 0}]}
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: (di["page"] == 0)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_2() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"$or": [{"page": 0}, {"page": 1}]}
|
||||||
|
)
|
||||||
|
assert len(output) == 2
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] == Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output[1][1] == 1.0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] == 0) or (di["page"] == 1),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_or_3() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={"$or": [{"page": 0}, {"page": 1}, {"page": 2}]},
|
||||||
|
)
|
||||||
|
assert len(output) == 3
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] != Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[2][0] != Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] == 0) or (di["page"] == 1) or (di["page"] == 2),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_1() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"$and": [{"page": 0}]}
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: (di["page"] == 0)
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_2() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"$and": [{"page": 0}, {"page": 1}]}
|
||||||
|
)
|
||||||
|
assert len(output) == 0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] == 0) and (di["page"] == 1),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_3() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={"$and": [{"page": 0}, {"page": 1}, {"page": 2}]},
|
||||||
|
)
|
||||||
|
assert len(output) == 0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] == 0) and (di["page"] == 1) and (di["page"] == 2),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_operators_filter_and_4() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={"$and": [{"page": 0}, {"page": 0}, {"page": 0}]},
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] == 0) and (di["page"] == 0) and (di["page"] == 0),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_1() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={"$and": [{"$or": [{"page": 1}, {"page": 2}]}, {"$not": {"page": 1}}]},
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] == 1 or di["page"] == 2)
|
||||||
|
and (not di["page"] == 1),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_2() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={
|
||||||
|
"$and": [
|
||||||
|
{"$or": [{"page": 1}, {"page": 2}]},
|
||||||
|
{"$or": [{"page": 3}, {"page": 2}, {"page": 0}]},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="fou", metadata={"page": 2})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] == 1 or di["page"] == 2)
|
||||||
|
and (di["page"] == 3 or di["page"] == 2 or di["page"] == 0),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_nested_logical_operators_filter_3() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={
|
||||||
|
"$or": [
|
||||||
|
{"$and": [{"page": 1}, {"page": 2}]},
|
||||||
|
{"$and": [{"page": 0}, {"page": 2}]},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert len(output) == 0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] == 1 and di["page"] == 2)
|
||||||
|
or (di["page"] == 0 and di["page"] == 2),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_comparsion_operators_filter_1() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
|
||||||
|
)
|
||||||
|
assert len(output) == 2
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: di["page"] < 1 or di["page"] > 2,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_logical_comparsion_operators_filter_2() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"$not": {"page": {"$lt": 1}}}
|
||||||
|
)
|
||||||
|
assert len(output) == 3
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 1})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
|
||||||
|
assert output[2][0] == Document(page_content="fou", metadata={"page": 2})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: not di["page"] < 1
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_nested_logical_comparsion_ops_filter_1() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={
|
||||||
|
"$and": [
|
||||||
|
{"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
|
||||||
|
{"$or": [{"page": {"$eq": 0}}, {"page": {"$eq": 1}}]},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert len(output) == 1
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] < 1 or di["page"] > 2)
|
||||||
|
and (di["page"] == 0 or di["page"] == 1),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_nested_logical_comparsion_ops_filter_2() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={
|
||||||
|
"$and": [
|
||||||
|
{"$or": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
|
||||||
|
{"$not": {"page": {"$in": [0]}}},
|
||||||
|
{"page": {"$neq": 3}},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert len(output) == 0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] < 1 or di["page"] > 2)
|
||||||
|
and (di["page"] not in [0])
|
||||||
|
and (di["page"] != 3),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_nested_logical_comparsion_ops_filter_3() -> None:
|
||||||
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={
|
||||||
|
"$or": [
|
||||||
|
{"$and": [{"page": {"$lt": 1}}, {"page": {"$gt": 2}}]},
|
||||||
|
{"$not": {"page": {"$nin": [0]}}},
|
||||||
|
{"page": {"$eq": 3}},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert len(output) == 2
|
||||||
|
assert output[0][0] == Document(page_content="foo", metadata={"page": 0})
|
||||||
|
assert output[0][1] == 0.0
|
||||||
|
assert output[1][0] == Document(page_content="foy", metadata={"page": 3})
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (di["page"] < 1 and di["page"] > 2)
|
||||||
|
or (not di["page"] not in [0])
|
||||||
|
or (di["page"] == 3),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_empty_conditions() -> None:
|
||||||
|
"""Test with an empty filter condition."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(output) == 3
|
||||||
|
assert all(doc[0].page_content in ["foo", "bar", "baz"] for doc in output)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadates_and_empty_and_operator() -> None:
|
||||||
|
"""Test with an empty $and operator."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
# Using an empty $and filter
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"$and": []}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(output) == 3
|
||||||
|
assert all(doc[0].page_content in ["foo", "bar", "baz"] for doc in output)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_empty_or_operator() -> None:
|
||||||
|
"""Test with an empty $or operator."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
# Using an empty $or filter
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"$or": []}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(output) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_nonexistent_field() -> None:
|
||||||
|
"""Test with a non-existent field in the metadata."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
# Using a filter with a non-existent field
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"nonexistent_field": {"$eq": 1}}
|
||||||
|
)
|
||||||
|
|
||||||
|
assert len(output) == 0 # Expecting no documents to match
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_invalid_logical_operator() -> None:
|
||||||
|
"""Test with an invalid logical operator key."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
# Using a filter with an invalid logical operator
|
||||||
|
with pytest.raises(ValueError, match="unsupported operator"):
|
||||||
|
docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"$unknown": [{"page": 1}]}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_invalid_comparison_operator() -> None:
|
||||||
|
"""Test with an invalid comparison operator key."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
# Using a filter with an invalid comparison operator
|
||||||
|
with pytest.raises(ValueError, match="unsupported operator"):
|
||||||
|
docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$invalid_operator": 1}}
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_valid_invalid_fields() -> None:
|
||||||
|
"""Test with logical operators combining valid and invalid field."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
# Using a filter with $and combining valid and invalid fields
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={
|
||||||
|
"$and": [
|
||||||
|
{"page": {"$eq": 1}}, # Valid field
|
||||||
|
{"invalid_field": {"$eq": 1}}, # Invalid field
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
# Expecting no documents to match due to the invalid field
|
||||||
|
assert len(output) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_valid_and_invalid_operators() -> None:
|
||||||
|
"""Test with logical operators combining valid and invalid operators."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
# Using a filter with $and combining valid and invalid operators
|
||||||
|
with pytest.raises(ValueError, match="unsupported operator"):
|
||||||
|
docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={
|
||||||
|
"$and": [
|
||||||
|
{"page": {"$eq": 1}}, # Valid condition
|
||||||
|
{"page": {"$unknown": 2}}, # Invalid operator
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_multiple_nested_logical_operators() -> None:
|
||||||
|
"""Test with multiple nested logical operators."""
|
||||||
|
texts = ["foo", "bar", "baz", "qux", "quux"]
|
||||||
|
metadatas = [
|
||||||
|
{"page": 1, "chapter": 1, "section": 3},
|
||||||
|
{"page": 2, "chapter": 2, "section": 4},
|
||||||
|
{"page": 1, "chapter": 3, "section": 6},
|
||||||
|
{"page": 3, "chapter": 2, "section": 5},
|
||||||
|
{"page": 4, "chapter": 1, "section": 2},
|
||||||
|
]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
# Using a filter with multiple nested logical operators
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={
|
||||||
|
"$and": [
|
||||||
|
{"$or": [{"page": {"$eq": 1}}, {"chapter": {"$gt": 2}}]},
|
||||||
|
{"$not": {"section": {"$lte": 5}}},
|
||||||
|
]
|
||||||
|
},
|
||||||
|
)
|
||||||
|
assert len(output) > 0
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (
|
||||||
|
(di["page"] == 1 or di["chapter"] > 2) and di["section"] > 5
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_mixed_data_types() -> None:
|
||||||
|
"""Test with metadata containing mixed data types (numbers, strings, booleans)."""
|
||||||
|
texts = ["foo", "bar", "baz", "qux", "quux"]
|
||||||
|
metadatas: list[dict] = [
|
||||||
|
{"page": "1", "isActive": True, "priority": 2.5},
|
||||||
|
{"page": 2, "isActive": False, "priority": 3.0},
|
||||||
|
{"page": 3, "isActive": True, "priority": 1.5},
|
||||||
|
{"page": 1, "isActive": True, "priority": 4.0},
|
||||||
|
{"page": 4, "isActive": False, "priority": 2.0},
|
||||||
|
]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
# Using a filter with mixed data types
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={
|
||||||
|
"page": {"$eq": "1"}, # String comparison
|
||||||
|
"isActive": {"$eq": True}, # Boolean comparison
|
||||||
|
"priority": {"$gt": 2.0}, # Numeric comparison
|
||||||
|
},
|
||||||
|
)
|
||||||
|
# Assert output matches expected results based on the filter conditions
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter=lambda di: (
|
||||||
|
di["page"] == "1" and di["isActive"] is True and di["priority"] > 2.0
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_conflicting_conditions() -> None:
|
||||||
|
"""Test with conflicting conditions in filters."""
|
||||||
|
texts = ["foo", "bar", "baz"]
|
||||||
|
metadatas = [{"page": i} for i in range(len(texts))]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec,
|
||||||
|
k=10,
|
||||||
|
lambda_mult=0.1,
|
||||||
|
filter={"$and": [{"page": {"$eq": 1}}, {"page": {"$eq": 2}}]},
|
||||||
|
)
|
||||||
|
# Assert that the output is empty due to conflicting conditions
|
||||||
|
assert len(output) == 0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.requires("faiss")
|
||||||
|
def test_faiss_mmr_with_metadatas_and_null_field_values() -> None:
|
||||||
|
"""Test with fields that have null or undefined values."""
|
||||||
|
texts = ["foo", "bar", "baz", "qux"]
|
||||||
|
metadatas: list[dict] = [{"page": 1}, {"page": None}, {"page": 2}, {"page": None}]
|
||||||
|
docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)
|
||||||
|
query_vec = FakeEmbeddings().embed_query(text="foo")
|
||||||
|
# Using a filter to find documents where page is null
|
||||||
|
output = docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter={"page": {"$eq": None}}
|
||||||
|
)
|
||||||
|
assert len(output) == 2
|
||||||
|
assert output == docsearch.max_marginal_relevance_search_with_score_by_vector(
|
||||||
|
query_vec, k=10, lambda_mult=0.1, filter=lambda di: di["page"] is None
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.requires("faiss")
|
@pytest.mark.requires("faiss")
|
||||||
async def test_faiss_async_mmr_with_metadatas_and_filter() -> None:
|
async def test_faiss_async_mmr_with_metadatas_and_filter() -> None:
|
||||||
texts = ["foo", "foo", "fou", "foy"]
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
|
Loading…
Reference in New Issue
Block a user