From c5a7a85a4e6cd307f83b2e455d466722d75940b2 Mon Sep 17 00:00:00 2001 From: Caleb Ellington Date: Fri, 2 Jun 2023 14:12:48 -0400 Subject: [PATCH] fix chroma update_document to embed entire documents, fixes a characer-wise embedding bug (#5584) # Chroma update_document full document embeddings bugfix Chroma update_document takes a single document, but treats the page_content sting of that document as a list when getting the new document embedding. This is a two-fold problem, where the resulting embedding for the updated document is incorrect (it's only an embedding of the first character in the new page_content) and it calls the embedding function for every character in the new page_content string, using many tokens in the process. Fixes #5582 Co-authored-by: Caleb Ellington --- langchain/vectorstores/chroma.py | 4 ++-- .../vectorstores/test_chroma.py | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/langchain/vectorstores/chroma.py b/langchain/vectorstores/chroma.py index 39577b0cf51..646a1508b2d 100644 --- a/langchain/vectorstores/chroma.py +++ b/langchain/vectorstores/chroma.py @@ -356,11 +356,11 @@ class Chroma(VectorStore): raise ValueError( "For update, you must specify an embedding function on creation." ) - embeddings = self._embedding_function.embed_documents(list(text)) + embeddings = self._embedding_function.embed_documents([text]) self._collection.update( ids=[document_id], - embeddings=[embeddings[0]], + embeddings=embeddings, documents=[text], metadatas=[metadata], ) diff --git a/tests/integration_tests/vectorstores/test_chroma.py b/tests/integration_tests/vectorstores/test_chroma.py index cc594d2ef90..652f8bcb3e6 100644 --- a/tests/integration_tests/vectorstores/test_chroma.py +++ b/tests/integration_tests/vectorstores/test_chroma.py @@ -3,7 +3,10 @@ import pytest from langchain.docstore.document import Document from langchain.vectorstores import Chroma -from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings +from tests.integration_tests.vectorstores.fake_embeddings import ( + ConsistentFakeEmbeddings, + FakeEmbeddings, +) def test_chroma() -> None: @@ -164,6 +167,8 @@ def test_chroma_with_include_parameter() -> None: def test_chroma_update_document() -> None: """Test the update_document function in the Chroma class.""" + # Make a consistent embedding + embedding = ConsistentFakeEmbeddings() # Initial document content and id initial_content = "foo" @@ -176,9 +181,12 @@ def test_chroma_update_document() -> None: docsearch = Chroma.from_documents( collection_name="test_collection", documents=[original_doc], - embedding=FakeEmbeddings(), + embedding=embedding, ids=[document_id], ) + old_embedding = docsearch._collection.peek()["embeddings"][ + docsearch._collection.peek()["ids"].index(document_id) + ] # Define updated content for the document updated_content = "updated foo" @@ -194,3 +202,10 @@ def test_chroma_update_document() -> None: # Assert that the updated document is returned by the search assert output == [Document(page_content=updated_content, metadata={"page": "0"})] + + # Assert that the new embedding is correct + new_embedding = docsearch._collection.peek()["embeddings"][ + docsearch._collection.peek()["ids"].index(document_id) + ] + assert new_embedding == embedding.embed_documents([updated_content])[0] + assert new_embedding != old_embedding