feat(core): add id field to Document passed to filter for InMemoryVectorStore similarity search (#32688)

Added an `id` field to the `Document` passed to the `filter` callable in `InMemoryVectorStore` similarity search. This allows filtering by `Document` id and brings the input to the filter in line with the result returned by the vector similarity search.

---------

Co-authored-by: Eugene Yurtsev <eyurtsev@gmail.com>
2025-09-11 16:01:33 +00:00 · 2025-09-08 22:39:18 +02:00
parent 97dd7628d2
commit 33c7f230e0
2 changed files with 39 additions and 1 deletions
--- a/libs/core/langchain_core/vectorstores/in_memory.py
+++ b/libs/core/langchain_core/vectorstores/in_memory.py
@@ -376,7 +376,11 @@ class InMemoryVectorStore(VectorStore):
            docs = [
                doc
                for doc in docs
-                if filter(Document(page_content=doc["text"], metadata=doc["metadata"]))
+                if filter(
+                    Document(
+                        id=doc["id"], page_content=doc["text"], metadata=doc["metadata"]
+                    )
+                )
            ]

        if not docs:
--- a/libs/core/tests/unit_tests/vectorstores/test_in_memory.py
+++ b/libs/core/tests/unit_tests/vectorstores/test_in_memory.py
@@ -117,6 +117,40 @@ async def test_inmemory_filter() -> None:
    assert output == []


+async def test_inmemory_filter_by_document_id() -> None:
+    """Test that similarity-search filters can match on the document ``id``."""
+    embedding = DeterministicFakeEmbedding(size=6)
+    store = InMemoryVectorStore(embedding=embedding)
+
+    # Store three documents with explicit IDs so the filter has known targets
+    documents = [
+        Document(page_content="first document", id="doc_1"),
+        Document(page_content="second document", id="doc_2"),
+        Document(page_content="third document", id="doc_3"),
+    ]
+    store.add_documents(documents)
+
+    # Sync search: an equality filter on id should return only that document
+    output = store.similarity_search("document", filter=lambda doc: doc.id == "doc_2")
+    assert len(output) == 1
+    assert output[0].page_content == "second document"
+    assert output[0].id == "doc_2"
+
+    # Async search: a membership filter on id should return both listed documents
+    output = await store.asimilarity_search(
+        "document", filter=lambda doc: doc.id in ["doc_1", "doc_3"]
+    )
+    assert len(output) == 2
+    ids = {doc.id for doc in output}
+    assert ids == {"doc_1", "doc_3"}
+
+    # An ID that was never stored should yield an empty result list
+    output = store.similarity_search(
+        "document", filter=lambda doc: doc.id == "non_existent"
+    )
+    assert output == []
+
+
 async def test_inmemory_upsert() -> None:
    """Test upsert documents."""
    embedding = DeterministicFakeEmbedding(size=2)