community[minor]: Improve InMemoryVectorStore with ability to persist to disk and filter on metadata. (#22186)

- **Description:** The InMemoryVectorStore is a nice and simple vector
store implementation for quick development and debugging. The current
implementation is quite limited in functionality. This PR extends
it by adding utility functions to persist the vector
store to a JSON file and to load it from a JSON file. We chose the JSON
file format because it allows inspection of the database contents in a
text editor, which is great for debugging. Furthermore, it adds a
`filter` keyword that can be used to filter documents based on their
`page_content` or `metadata`.
- **Issue:** -
- **Dependencies:** -
- **Twitter handle:** @Vincent_Min
This commit is contained in:
Vincent Min
2024-06-05 16:40:34 +02:00
committed by GitHub
parent c34ad8c163
commit 59bef31997
2 changed files with 91 additions and 30 deletions

View File

@@ -1,3 +1,5 @@
from pathlib import Path
from langchain_core.documents import Document
from langchain_community.vectorstores.inmemory import InMemoryVectorStore
@@ -44,3 +46,31 @@ async def test_inmemory_mmr() -> None:
assert len(output) == len(texts)
assert output[0] == Document(page_content="foo")
assert output[1] == Document(page_content="foy")
async def test_inmemory_dump_load(tmp_path: Path) -> None:
    """Test that a store dumped to a file and loaded back gives the same results.

    Args:
        tmp_path: pytest-provided temporary directory used for the dump file.
    """
    embedding = ConsistentFakeEmbeddings()
    store = await InMemoryVectorStore.afrom_texts(["foo", "bar", "baz"], embedding)
    output = await store.asimilarity_search("foo", k=1)
    # Guard against a vacuous pass: two empty result lists would compare equal
    # below even if dump/load lost every document.
    assert len(output) == 1

    test_file = str(tmp_path / "test.json")
    store.dump(test_file)

    # Reload with the same embedding so the query is embedded identically.
    loaded_store = InMemoryVectorStore.load(test_file, embedding)
    loaded_output = await loaded_store.asimilarity_search("foo", k=1)

    assert output == loaded_output
async def test_inmemory_filter() -> None:
    """Test that the ``filter`` callable restricts similarity search results."""

    def has_id_one(doc: Document) -> bool:
        # Keep only the document whose metadata carries id == 1.
        return doc.metadata["id"] == 1

    texts = ["foo", "bar"]
    metadatas = [{"id": 1}, {"id": 2}]
    store = await InMemoryVectorStore.afrom_texts(
        texts, ConsistentFakeEmbeddings(), metadatas
    )

    output = await store.asimilarity_search("baz", filter=has_id_one)

    expected = [Document(page_content="foo", metadata={"id": 1})]
    assert output == expected