From c5a7a85a4e6cd307f83b2e455d466722d75940b2 Mon Sep 17 00:00:00 2001
From: Caleb Ellington <caleb.n.ellington@gmail.com>
Date: Fri, 2 Jun 2023 14:12:48 -0400
Subject: [PATCH] fix chroma update_document to embed entire documents, fixes a
 characer-wise embedding bug (#5584)

# Chroma update_document full document embeddings bugfix

Chroma update_document takes a single document, but treats the
page_content sting of that document as a list when getting the new
document embedding.

This is a two-fold problem, where the resulting embedding for the
updated document is incorrect (it's only an embedding of the first
character in the new page_content) and it calls the embedding function
for every character in the new page_content string, using many tokens in
the process.

Fixes #5582


Co-authored-by: Caleb Ellington <calebellington@Calebs-MBP.hsd1.ca.comcast.net>
---
 langchain/vectorstores/chroma.py              |  4 ++--
 .../vectorstores/test_chroma.py               | 19 +++++++++++++++++--
 2 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/langchain/vectorstores/chroma.py b/langchain/vectorstores/chroma.py
index 39577b0cf51..646a1508b2d 100644
--- a/langchain/vectorstores/chroma.py
+++ b/langchain/vectorstores/chroma.py
@@ -356,11 +356,11 @@ class Chroma(VectorStore):
             raise ValueError(
                 "For update, you must specify an embedding function on creation."
             )
-        embeddings = self._embedding_function.embed_documents(list(text))
+        embeddings = self._embedding_function.embed_documents([text])
 
         self._collection.update(
             ids=[document_id],
-            embeddings=[embeddings[0]],
+            embeddings=embeddings,
             documents=[text],
             metadatas=[metadata],
         )
diff --git a/tests/integration_tests/vectorstores/test_chroma.py b/tests/integration_tests/vectorstores/test_chroma.py
index cc594d2ef90..652f8bcb3e6 100644
--- a/tests/integration_tests/vectorstores/test_chroma.py
+++ b/tests/integration_tests/vectorstores/test_chroma.py
@@ -3,7 +3,10 @@ import pytest
 
 from langchain.docstore.document import Document
 from langchain.vectorstores import Chroma
-from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
+from tests.integration_tests.vectorstores.fake_embeddings import (
+    ConsistentFakeEmbeddings,
+    FakeEmbeddings,
+)
 
 
 def test_chroma() -> None:
@@ -164,6 +167,8 @@ def test_chroma_with_include_parameter() -> None:
 
 def test_chroma_update_document() -> None:
     """Test the update_document function in the Chroma class."""
+    # Make a consistent embedding
+    embedding = ConsistentFakeEmbeddings()
 
     # Initial document content and id
     initial_content = "foo"
@@ -176,9 +181,12 @@ def test_chroma_update_document() -> None:
     docsearch = Chroma.from_documents(
         collection_name="test_collection",
         documents=[original_doc],
-        embedding=FakeEmbeddings(),
+        embedding=embedding,
         ids=[document_id],
     )
+    old_embedding = docsearch._collection.peek()["embeddings"][
+        docsearch._collection.peek()["ids"].index(document_id)
+    ]
 
     # Define updated content for the document
     updated_content = "updated foo"
@@ -194,3 +202,10 @@ def test_chroma_update_document() -> None:
 
     # Assert that the updated document is returned by the search
     assert output == [Document(page_content=updated_content, metadata={"page": "0"})]
+
+    # Assert that the new embedding is correct
+    new_embedding = docsearch._collection.peek()["embeddings"][
+        docsearch._collection.peek()["ids"].index(document_id)
+    ]
+    assert new_embedding == embedding.embed_documents([updated_content])[0]
+    assert new_embedding != old_embedding