mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-22 15:38:06 +00:00
Harrison/shallow metadata (#1599)
Co-authored-by: Jesse Zhang <jessetanzhang@gmail.com>
This commit is contained in:
parent
c6bfa00178
commit
f95d551f7a
@ -1,6 +1,7 @@
|
||||
"""Functionality for splitting text."""
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import (
|
||||
@ -51,7 +52,10 @@ class TextSplitter(ABC):
|
||||
documents = []
|
||||
for i, text in enumerate(texts):
|
||||
for chunk in self.split_text(text):
|
||||
documents.append(Document(page_content=chunk, metadata=_metadatas[i]))
|
||||
new_doc = Document(
|
||||
page_content=chunk, metadata=copy.deepcopy(_metadatas[i])
|
||||
)
|
||||
documents.append(new_doc)
|
||||
return documents
|
||||
|
||||
def split_documents(self, documents: List[Document]) -> List[Document]:
|
||||
|
@ -94,6 +94,21 @@ def test_create_documents_with_metadata() -> None:
|
||||
assert docs == expected_docs
|
||||
|
||||
|
||||
def test_metadata_not_shallow() -> None:
    """Test that metadatas are not shallow.

    Both chunks are produced from the same input text and therefore from the
    same source metadata dict. Each resulting document must own an
    independent copy: a mutation made through one document (at the top level
    or inside a nested value) must not show up in its sibling or in the dict
    the caller passed in.
    """
    texts = ["foo bar"]
    splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
    docs = splitter.create_documents(texts, [{"source": "1"}])
    expected_docs = [
        Document(page_content="foo", metadata={"source": "1"}),
        Document(page_content="bar", metadata={"source": "1"}),
    ]
    assert docs == expected_docs

    # Top-level mutation: a key added to one document stays local to it.
    docs[0].metadata["foo"] = 1
    assert docs[0].metadata == {"source": "1", "foo": 1}
    assert docs[1].metadata == {"source": "1"}

    # Nested mutation: a per-document shallow copy would still share the
    # inner list between chunks, so only a deep copy satisfies these checks.
    source_metadata = {"source": "1", "tags": ["a"]}
    nested_docs = splitter.create_documents(texts, [source_metadata])
    nested_docs[0].metadata["tags"].append("b")
    assert nested_docs[0].metadata == {"source": "1", "tags": ["a", "b"]}
    assert nested_docs[1].metadata == {"source": "1", "tags": ["a"]}
    # The caller's own metadata must be left untouched as well.
    assert source_metadata == {"source": "1", "tags": ["a"]}
|
||||
|
||||
|
||||
def test_iterative_text_splitter() -> None:
|
||||
"""Test iterative text splitter."""
|
||||
text = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.
|
||||
|
Loading…
Reference in New Issue
Block a user