mirror of
https://github.com/hwchase17/langchain.git
synced 2025-05-23 16:08:10 +00:00
Harrison/shallow metadata (#1599)
Co-authored-by: Jesse Zhang <jessetanzhang@gmail.com>
This commit is contained in:
parent
c6bfa00178
commit
f95d551f7a
@ -1,6 +1,7 @@
|
|||||||
"""Functionality for splitting text."""
|
"""Functionality for splitting text."""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import copy
|
||||||
import logging
|
import logging
|
||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import (
|
from typing import (
|
||||||
@ -51,7 +52,10 @@ class TextSplitter(ABC):
|
|||||||
documents = []
|
documents = []
|
||||||
for i, text in enumerate(texts):
|
for i, text in enumerate(texts):
|
||||||
for chunk in self.split_text(text):
|
for chunk in self.split_text(text):
|
||||||
documents.append(Document(page_content=chunk, metadata=_metadatas[i]))
|
new_doc = Document(
|
||||||
|
page_content=chunk, metadata=copy.deepcopy(_metadatas[i])
|
||||||
|
)
|
||||||
|
documents.append(new_doc)
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
def split_documents(self, documents: List[Document]) -> List[Document]:
|
def split_documents(self, documents: List[Document]) -> List[Document]:
|
||||||
|
@ -94,6 +94,21 @@ def test_create_documents_with_metadata() -> None:
|
|||||||
assert docs == expected_docs
|
assert docs == expected_docs
|
||||||
|
|
||||||
|
|
||||||
|
def test_metadata_not_shallow() -> None:
    """Check that every created document owns an independent metadata dict."""
    text_splitter = CharacterTextSplitter(separator=" ", chunk_size=3, chunk_overlap=0)
    result = text_splitter.create_documents(["foo bar"], [{"source": "1"}])
    assert result == [
        Document(page_content="foo", metadata={"source": "1"}),
        Document(page_content="bar", metadata={"source": "1"}),
    ]
    # Both chunks were produced from the same input metadata dict. Mutating
    # the first chunk's metadata must not be visible on the second chunk.
    first_doc, second_doc = result[0], result[1]
    first_doc.metadata["foo"] = 1
    assert first_doc.metadata == {"source": "1", "foo": 1}
    assert second_doc.metadata == {"source": "1"}
|
||||||
|
|
||||||
|
|
||||||
def test_iterative_text_splitter() -> None:
|
def test_iterative_text_splitter() -> None:
|
||||||
"""Test iterative text splitter."""
|
"""Test iterative text splitter."""
|
||||||
text = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.
|
text = """Hi.\n\nI'm Harrison.\n\nHow? Are? You?\nOkay then f f f f.
|
||||||
|
Loading…
Reference in New Issue
Block a user