mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 16:43:35 +00:00
feat: faiss filter from list (#6537)
### Feature Using FAISS on a retrievalQA task, I found myself wanting to filter in multiple sources at once. As I understand it, the filter feature takes a dict of the form {key: value} and keeps only documents whose metadata holds exactly that value for that key. I added logic so that a list can be passed as the value; the metadata value is then checked for membership in that list instead of for equality. Passing a single exact value still works. Here's an example of how I could then use it in my own project: ``` pdfs_to_filter_in = ["file_A", "file_B"] filter_dict = { "source": [f"source_pdfs/{pdf_name}.pdf" for pdf_name in pdfs_to_filter_in] } retriever = db.as_retriever() retriever.search_kwargs = {"filter": filter_dict} ``` I added an integration test, `test_faiss_with_metadatas_and_list_filter()`, modelled on the existing ones in `tests/integration_tests/vectorstores/test_faiss.py`. It doesn't feel like this is worthy of its own notebook or doc, but I'm open to suggestions if needed. Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
This commit is contained in:
parent
00a7403236
commit
e0605b464b
@ -192,7 +192,7 @@ class FAISS(VectorStore):
|
||||
Args:
|
||||
embedding: Embedding vector to look up documents similar to.
|
||||
k: Number of Documents to return. Defaults to 4.
|
||||
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
||||
filter (Optional[Dict[str, Any]]): Filter by metadata. Defaults to None.
|
||||
fetch_k: (Optional[int]) Number of Documents to fetch before filtering.
|
||||
Defaults to 20.
|
||||
**kwargs: kwargs to be passed to similarity search. Can include:
|
||||
@ -218,7 +218,11 @@ class FAISS(VectorStore):
|
||||
if not isinstance(doc, Document):
|
||||
raise ValueError(f"Could not find document for id {_id}, got {doc}")
|
||||
if filter is not None:
|
||||
if all(doc.metadata.get(key) == value for key, value in filter.items()):
|
||||
filter = {
|
||||
key: [value] if not isinstance(value, list) else value
|
||||
for key, value in filter.items()
|
||||
}
|
||||
if all(doc.metadata.get(key) in value for key, value in filter.items()):
|
||||
docs.append((doc, scores[0][j]))
|
||||
else:
|
||||
docs.append((doc, scores[0][j]))
|
||||
|
@ -96,6 +96,34 @@ def test_faiss_with_metadatas_and_filter() -> None:
|
||||
assert output == [Document(page_content="bar", metadata={"page": 1})]
|
||||
|
||||
|
||||
def test_faiss_with_metadatas_and_list_filter() -> None:
    """Test construction and search with a list-valued metadata filter."""
    texts = ["foo", "bar", "baz", "foo", "qux"]
    # Pages run 0..3; every text from the fifth onwards shares page 3.
    metadatas = [{"page": min(position, 3)} for position in range(len(texts))]
    docsearch = FAISS.from_texts(texts, FakeEmbeddings(), metadatas=metadatas)

    # Spell out the expected pages explicitly instead of reusing `metadatas`,
    # so the docstore check stays independent of the input construction.
    expected_pages = [0, 1, 2, 3, 3]
    expected_documents = {
        docsearch.index_to_docstore_id[position]: Document(
            page_content=text, metadata={"page": page}
        )
        for position, (text, page) in enumerate(zip(texts, expected_pages))
    }
    expected_docstore = InMemoryDocstore(expected_documents)
    assert docsearch.docstore.__dict__ == expected_docstore.__dict__

    # A list value matches any document whose metadata value is a member of it.
    output = docsearch.similarity_search("foor", k=1, filter={"page": [0, 1, 2]})
    assert output == [Document(page_content="foo", metadata={"page": 0})]
|
||||
|
||||
|
||||
def test_faiss_search_not_found() -> None:
|
||||
"""Test what happens when document is not found."""
|
||||
texts = ["foo", "bar", "baz"]
|
||||
|
Loading…
Reference in New Issue
Block a user