mirror of
https://github.com/hwchase17/langchain.git
synced 2025-08-20 09:57:32 +00:00
fix chroma update_document to embed entire documents, fixes a characer-wise embedding bug (#5584)
# Chroma update_document full document embeddings bugfix Chroma update_document takes a single document, but treats the page_content sting of that document as a list when getting the new document embedding. This is a two-fold problem, where the resulting embedding for the updated document is incorrect (it's only an embedding of the first character in the new page_content) and it calls the embedding function for every character in the new page_content string, using many tokens in the process. Fixes #5582 Co-authored-by: Caleb Ellington <calebellington@Calebs-MBP.hsd1.ca.comcast.net>
This commit is contained in:
parent
3c6fa9126a
commit
c5a7a85a4e
@ -356,11 +356,11 @@ class Chroma(VectorStore):
|
|||||||
raise ValueError(
|
raise ValueError(
|
||||||
"For update, you must specify an embedding function on creation."
|
"For update, you must specify an embedding function on creation."
|
||||||
)
|
)
|
||||||
embeddings = self._embedding_function.embed_documents(list(text))
|
embeddings = self._embedding_function.embed_documents([text])
|
||||||
|
|
||||||
self._collection.update(
|
self._collection.update(
|
||||||
ids=[document_id],
|
ids=[document_id],
|
||||||
embeddings=[embeddings[0]],
|
embeddings=embeddings,
|
||||||
documents=[text],
|
documents=[text],
|
||||||
metadatas=[metadata],
|
metadatas=[metadata],
|
||||||
)
|
)
|
||||||
|
@ -3,7 +3,10 @@ import pytest
|
|||||||
|
|
||||||
from langchain.docstore.document import Document
|
from langchain.docstore.document import Document
|
||||||
from langchain.vectorstores import Chroma
|
from langchain.vectorstores import Chroma
|
||||||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
from tests.integration_tests.vectorstores.fake_embeddings import (
|
||||||
|
ConsistentFakeEmbeddings,
|
||||||
|
FakeEmbeddings,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def test_chroma() -> None:
|
def test_chroma() -> None:
|
||||||
@ -164,6 +167,8 @@ def test_chroma_with_include_parameter() -> None:
|
|||||||
|
|
||||||
def test_chroma_update_document() -> None:
|
def test_chroma_update_document() -> None:
|
||||||
"""Test the update_document function in the Chroma class."""
|
"""Test the update_document function in the Chroma class."""
|
||||||
|
# Make a consistent embedding
|
||||||
|
embedding = ConsistentFakeEmbeddings()
|
||||||
|
|
||||||
# Initial document content and id
|
# Initial document content and id
|
||||||
initial_content = "foo"
|
initial_content = "foo"
|
||||||
@ -176,9 +181,12 @@ def test_chroma_update_document() -> None:
|
|||||||
docsearch = Chroma.from_documents(
|
docsearch = Chroma.from_documents(
|
||||||
collection_name="test_collection",
|
collection_name="test_collection",
|
||||||
documents=[original_doc],
|
documents=[original_doc],
|
||||||
embedding=FakeEmbeddings(),
|
embedding=embedding,
|
||||||
ids=[document_id],
|
ids=[document_id],
|
||||||
)
|
)
|
||||||
|
old_embedding = docsearch._collection.peek()["embeddings"][
|
||||||
|
docsearch._collection.peek()["ids"].index(document_id)
|
||||||
|
]
|
||||||
|
|
||||||
# Define updated content for the document
|
# Define updated content for the document
|
||||||
updated_content = "updated foo"
|
updated_content = "updated foo"
|
||||||
@ -194,3 +202,10 @@ def test_chroma_update_document() -> None:
|
|||||||
|
|
||||||
# Assert that the updated document is returned by the search
|
# Assert that the updated document is returned by the search
|
||||||
assert output == [Document(page_content=updated_content, metadata={"page": "0"})]
|
assert output == [Document(page_content=updated_content, metadata={"page": "0"})]
|
||||||
|
|
||||||
|
# Assert that the new embedding is correct
|
||||||
|
new_embedding = docsearch._collection.peek()["embeddings"][
|
||||||
|
docsearch._collection.peek()["ids"].index(document_id)
|
||||||
|
]
|
||||||
|
assert new_embedding == embedding.embed_documents([updated_content])[0]
|
||||||
|
assert new_embedding != old_embedding
|
||||||
|
Loading…
Reference in New Issue
Block a user