mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-20 01:49:51 +00:00
fix chroma update_document to embed entire documents, fixes a characer-wise embedding bug (#5584)
# Chroma update_document full document embeddings bugfix Chroma update_document takes a single document, but treats the page_content sting of that document as a list when getting the new document embedding. This is a two-fold problem, where the resulting embedding for the updated document is incorrect (it's only an embedding of the first character in the new page_content) and it calls the embedding function for every character in the new page_content string, using many tokens in the process. Fixes #5582 Co-authored-by: Caleb Ellington <calebellington@Calebs-MBP.hsd1.ca.comcast.net>
This commit is contained in:
parent
3c6fa9126a
commit
c5a7a85a4e
@ -356,11 +356,11 @@ class Chroma(VectorStore):
|
||||
raise ValueError(
|
||||
"For update, you must specify an embedding function on creation."
|
||||
)
|
||||
embeddings = self._embedding_function.embed_documents(list(text))
|
||||
embeddings = self._embedding_function.embed_documents([text])
|
||||
|
||||
self._collection.update(
|
||||
ids=[document_id],
|
||||
embeddings=[embeddings[0]],
|
||||
embeddings=embeddings,
|
||||
documents=[text],
|
||||
metadatas=[metadata],
|
||||
)
|
||||
|
@ -3,7 +3,10 @@ import pytest
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.vectorstores import Chroma
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import (
|
||||
ConsistentFakeEmbeddings,
|
||||
FakeEmbeddings,
|
||||
)
|
||||
|
||||
|
||||
def test_chroma() -> None:
|
||||
@ -164,6 +167,8 @@ def test_chroma_with_include_parameter() -> None:
|
||||
|
||||
def test_chroma_update_document() -> None:
|
||||
"""Test the update_document function in the Chroma class."""
|
||||
# Make a consistent embedding
|
||||
embedding = ConsistentFakeEmbeddings()
|
||||
|
||||
# Initial document content and id
|
||||
initial_content = "foo"
|
||||
@ -176,9 +181,12 @@ def test_chroma_update_document() -> None:
|
||||
docsearch = Chroma.from_documents(
|
||||
collection_name="test_collection",
|
||||
documents=[original_doc],
|
||||
embedding=FakeEmbeddings(),
|
||||
embedding=embedding,
|
||||
ids=[document_id],
|
||||
)
|
||||
old_embedding = docsearch._collection.peek()["embeddings"][
|
||||
docsearch._collection.peek()["ids"].index(document_id)
|
||||
]
|
||||
|
||||
# Define updated content for the document
|
||||
updated_content = "updated foo"
|
||||
@ -194,3 +202,10 @@ def test_chroma_update_document() -> None:
|
||||
|
||||
# Assert that the updated document is returned by the search
|
||||
assert output == [Document(page_content=updated_content, metadata={"page": "0"})]
|
||||
|
||||
# Assert that the new embedding is correct
|
||||
new_embedding = docsearch._collection.peek()["embeddings"][
|
||||
docsearch._collection.peek()["ids"].index(document_id)
|
||||
]
|
||||
assert new_embedding == embedding.embed_documents([updated_content])[0]
|
||||
assert new_embedding != old_embedding
|
||||
|
Loading…
Reference in New Issue
Block a user