From f95d551f7a37b8bef3dead1432647dfc87ac3cec Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Sat, 11 Mar 2023 09:18:25 -0800
Subject: [PATCH] Harrison/shallow metadata (#1599)

Co-authored-by: Jesse Zhang
---
 langchain/text_splitter.py             |  6 +++++-
 tests/unit_tests/test_text_splitter.py | 15 +++++++++++++++
 2 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/langchain/text_splitter.py b/langchain/text_splitter.py
index 7b3bc3268e7..4e297e3b0e0 100644
--- a/langchain/text_splitter.py
+++ b/langchain/text_splitter.py
@@ -1,6 +1,7 @@
 """Functionality for splitting text."""
 from __future__ import annotations
 
+import copy
 import logging
 from abc import ABC, abstractmethod
 from typing import (
@@ -51,7 +52,10 @@ class TextSplitter(ABC):
         documents = []
         for i, text in enumerate(texts):
             for chunk in self.split_text(text):
-                documents.append(Document(page_content=chunk, metadata=_metadatas[i]))
+                new_doc = Document(
+                    page_content=chunk, metadata=copy.deepcopy(_metadatas[i])
+                )
+                documents.append(new_doc)
         return documents
 
     def split_documents(self, documents: List[Document]) -> List[Document]:
diff --git a/tests/unit_tests/test_text_splitter.py b/tests/unit_tests/test_text_splitter.py
index 90c213723fd..dbfb9b5fe2b 100644
--- a/tests/unit_tests/test_text_splitter.py
+++ b/tests/unit_tests/test_text_splitter.py
@@ -94,6 +94,21 @@ def test_create_documents_with_metadata() -> None:
     assert docs == expected_docs
 
 
+def test_metadata_not_shallow() -> None:
+    """Test that metadatas are not shallow."""
+    texts = ["foo bar"]
+    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
+    docs = splitter.create_documents(texts, [{"source": "1"}])
+    expected_docs = [
+        Document(page_content="foo", metadata={"source": "1"}),
+        Document(page_content="bar", metadata={"source": "1"}),
+    ]
+    assert docs == expected_docs
+    docs[0].metadata["foo"] = 1
+    assert docs[0].metadata == {"source": "1", "foo": 1}
+    assert docs[1].metadata == {"source": "1"}
+
+
 def test_iterative_text_splitter() -> None:
     """Test iterative text splitter."""
     text = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.