diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py index 7b3bc3268e7..4e297e3b0e0 100644 --- a/langchain/text_splitter.py +++ b/langchain/text_splitter.py @@ -1,6 +1,7 @@ """Functionality for splitting text.""" from __future__ import annotations +import copy import logging from abc import ABC, abstractmethod from typing import ( @@ -51,7 +52,10 @@ class TextSplitter(ABC): documents = [] for i, text in enumerate(texts): for chunk in self.split_text(text): - documents.append(Document(page_content=chunk, metadata=_metadatas[i])) + new_doc = Document( + page_content=chunk, metadata=copy.deepcopy(_metadatas[i]) + ) + documents.append(new_doc) return documents def split_documents(self, documents: List[Document]) -> List[Document]: diff --git a/tests/unit_tests/test_text_splitter.py b/tests/unit_tests/test_text_splitter.py index 90c213723fd..dbfb9b5fe2b 100644 --- a/tests/unit_tests/test_text_splitter.py +++ b/tests/unit_tests/test_text_splitter.py @@ -94,6 +94,21 @@ def test_create_documents_with_metadata() -> None: assert docs == expected_docs +def test_metadata_not_shallow() -> None: + """Test that metadatas are not shallow.""" + texts = ["foo bar"] + splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0) + docs = splitter.create_documents(texts, [{"source": "1"}]) + expected_docs = [ + Document(page_content="foo", metadata={"source": "1"}), + Document(page_content="bar", metadata={"source": "1"}), + ] + assert docs == expected_docs + docs[0].metadata["foo"] = 1 + assert docs[0].metadata == {"source": "1", "foo": 1} + assert docs[1].metadata == {"source": "1"} + + def test_iterative_text_splitter() -> None: """Test iterative text splitter.""" text = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.