mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-05 04:38:26 +00:00
Fix update_document function, add test and documentation. (#5359)
# Fix for `update_document` Function in Chroma

## Summary

This pull request addresses an issue with the `update_document` function in the Chroma class, as described in [#5031](https://github.com/hwchase17/langchain/issues/5031#issuecomment-1562577947). The issue was identified as an `AttributeError` raised when calling `update_document`, because the `Collection` object has no corresponding method. This fix refactors the `update_document` method in `Chroma` to correctly interact with the `Collection` object.

## Changes

1. Fixed the `update_document` method in the `Chroma` class to correctly call methods on the `Collection` object.
2. Added the corresponding test `test_chroma_update_document` in `tests/integration_tests/vectorstores/test_chroma.py` to reflect the updated method call.
3. Added an example and explanation of how to use the `update_document` function in the Jupyter notebook tutorial for Chroma.

## Test Plan

All existing tests pass after this change. In addition, the `test_chroma_update_document` test case now correctly checks the functionality of `update_document`, ensuring that the function works as expected and updates the content of documents correctly.

## Reviewers

@dev2049

This fix will ensure that users are able to use the `update_document` function as expected, without encountering the previous `AttributeError`. This will enhance the usability and reliability of the Chroma class for all users.

Thank you for considering this pull request. I look forward to your feedback and suggestions.
This commit is contained in:
parent
e455ba4ed5
commit
44b48d9518
@ -1,6 +1,7 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "683953b3",
|
||||
"metadata": {},
|
||||
@ -33,7 +34,7 @@
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdin",
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" ········\n"
|
||||
@ -86,7 +87,6 @@
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import TextLoader\n",
|
||||
"loader = TextLoader('../../../state_of_the_union.txt')\n",
|
||||
"documents = loader.load()\n",
|
||||
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
|
||||
@ -143,6 +143,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "18152965",
|
||||
"metadata": {},
|
||||
@ -187,6 +188,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "8061454b",
|
||||
"metadata": {},
|
||||
@ -197,6 +199,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "2b76db26",
|
||||
"metadata": {},
|
||||
@ -232,6 +235,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "f568a322",
|
||||
"metadata": {},
|
||||
@ -262,6 +266,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "cc9ed900",
|
||||
"metadata": {},
|
||||
@ -292,6 +297,7 @@
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "794a7552",
|
||||
"metadata": {},
|
||||
@ -336,13 +342,81 @@
|
||||
"retriever.get_relevant_documents(query)[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "2a877f08",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Updating a Document\n",
|
||||
"The `update_document` function allows you to modify the content of a document in the Chroma instance after it has been added. Let's see an example of how to use this function."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"execution_count": 20,
|
||||
"id": "a559c3f1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
"source": [
|
||||
"# Import Document class\n",
|
||||
"from langchain.docstore.document import Document\n",
|
||||
"\n",
|
||||
"# Initial document content and id\n",
|
||||
"initial_content = \"This is an initial document content\"\n",
|
||||
"document_id = \"doc1\"\n",
|
||||
"\n",
|
||||
"# Create an instance of Document with initial content and metadata\n",
|
||||
"original_doc = Document(page_content=initial_content, metadata={\"page\": \"0\"})\n",
|
||||
"\n",
|
||||
"# Initialize a Chroma instance with the original document\n",
|
||||
"new_db = Chroma.from_documents(\n",
|
||||
" collection_name=\"test_collection\",\n",
|
||||
" documents=[original_doc],\n",
|
||||
" embedding=OpenAIEmbeddings(), # using the same embeddings as before\n",
|
||||
" ids=[document_id],\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"attachments": {},
|
||||
"cell_type": "markdown",
|
||||
"id": "60a7c273",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"At this point, we have a new Chroma instance with a single document \"This is an initial document content\" with id \"doc1\". Now, let's update the content of the document."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 22,
|
||||
"id": "55e48056",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"This is the updated document content {'page': '1'}\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Updated document content\n",
|
||||
"updated_content = \"This is the updated document content\"\n",
|
||||
"\n",
|
||||
"# Create a new Document instance with the updated content\n",
|
||||
"updated_doc = Document(page_content=updated_content, metadata={\"page\": \"1\"})\n",
|
||||
"\n",
|
||||
"# Update the document in the Chroma instance by passing the document id and the updated document\n",
|
||||
"new_db.update_document(document_id=document_id, document=updated_doc)\n",
|
||||
"\n",
|
||||
"# Now, let's retrieve the updated document using similarity search\n",
|
||||
"output = new_db.similarity_search(updated_content, k=1)\n",
|
||||
"\n",
|
||||
"# Print the content of the retrieved document\n",
|
||||
"print(output[0].page_content, output[0].metadata)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
|
@ -348,7 +348,18 @@ class Chroma(VectorStore):
|
||||
"""
|
||||
text = document.page_content
|
||||
metadata = document.metadata
|
||||
self._collection.update_document(document_id, text, metadata)
|
||||
if self._embedding_function is None:
|
||||
raise ValueError(
|
||||
"For update, you must specify an embedding function on creation."
|
||||
)
|
||||
embeddings = self._embedding_function.embed_documents(list(text))
|
||||
|
||||
self._collection.update(
|
||||
ids=[document_id],
|
||||
embeddings=[embeddings[0]],
|
||||
documents=[text],
|
||||
metadatas=[metadata],
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def from_texts(
|
||||
|
@ -160,3 +160,37 @@ def test_chroma_with_include_parameter() -> None:
|
||||
assert output["embeddings"] is not None
|
||||
output = docsearch.get()
|
||||
assert output["embeddings"] is None
|
||||
|
||||
|
||||
def test_chroma_update_document() -> None:
    """Test that update_document replaces a stored document's text and metadata.

    A single document is stored under a known id, then updated in place with
    new text and new metadata. A similarity search for the new text must
    return exactly the updated document.
    """
    document_id = "doc1"

    # Seed the collection with one document stored under a known id.
    original_doc = Document(page_content="foo", metadata={"page": "0"})
    docsearch = Chroma.from_documents(
        collection_name="test_collection",
        documents=[original_doc],
        embedding=FakeEmbeddings(),
        ids=[document_id],
    )

    # The updated document deliberately carries different metadata from the
    # original, so the final assertion fails if only the text is updated.
    updated_content = "updated foo"
    updated_doc = Document(page_content=updated_content, metadata={"page": "1"})

    # Overwrite the stored document in place, addressing it by id.
    docsearch.update_document(document_id=document_id, document=updated_doc)

    # The search must now surface the new text and the new metadata.
    output = docsearch.similarity_search(updated_content, k=1)
    assert output == [Document(page_content=updated_content, metadata={"page": "1"})]
|
||||
|
Loading…
Reference in New Issue
Block a user