mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-26 00:23:25 +00:00
community: Correctly handle multi-element rich text (#25762)
**Description:** - Add a `_concatenate_rich_text` method to combine all elements in rich text arrays - Update the `load_page` method to use `_concatenate_rich_text` for rich text properties - Ensure all text content is captured, including inline code and formatted text - Add unit tests to verify correct handling of multi-element rich text. This fix prevents truncation of content after backticks or other formatting elements. **Issue:** When using the Notion DB Loader, the text of `rich_text` and `title` properties was truncated after the first element, because the loader read only the first element of the array. **Dependencies:** None. --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
b2102b8cc4
commit
8f5e72de05
@ -107,19 +107,15 @@ class NotionDBLoader(BaseLoader):
|
|||||||
# load properties as metadata
|
# load properties as metadata
|
||||||
metadata: Dict[str, Any] = {}
|
metadata: Dict[str, Any] = {}
|
||||||
|
|
||||||
|
value: Any
|
||||||
|
|
||||||
for prop_name, prop_data in page_summary["properties"].items():
|
for prop_name, prop_data in page_summary["properties"].items():
|
||||||
prop_type = prop_data["type"]
|
prop_type = prop_data["type"]
|
||||||
|
|
||||||
if prop_type == "rich_text":
|
if prop_type == "rich_text":
|
||||||
value = (
|
value = self._concatenate_rich_text(prop_data["rich_text"])
|
||||||
prop_data["rich_text"][0]["plain_text"]
|
|
||||||
if prop_data["rich_text"]
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
elif prop_type == "title":
|
elif prop_type == "title":
|
||||||
value = (
|
value = self._concatenate_rich_text(prop_data["title"])
|
||||||
prop_data["title"][0]["plain_text"] if prop_data["title"] else None
|
|
||||||
)
|
|
||||||
elif prop_type == "multi_select":
|
elif prop_type == "multi_select":
|
||||||
value = (
|
value = (
|
||||||
[item["name"] for item in prop_data["multi_select"]]
|
[item["name"] for item in prop_data["multi_select"]]
|
||||||
@ -228,3 +224,7 @@ class NotionDBLoader(BaseLoader):
|
|||||||
)
|
)
|
||||||
res.raise_for_status()
|
res.raise_for_status()
|
||||||
return res.json()
|
return res.json()
|
||||||
|
|
||||||
|
def _concatenate_rich_text(self, rich_text_array: List[Dict[str, Any]]) -> str:
|
||||||
|
"""Concatenate all text content from a rich_text array."""
|
||||||
|
return "".join(item["plain_text"] for item in rich_text_array)
|
||||||
|
@ -0,0 +1,138 @@
|
|||||||
|
from unittest.mock import Mock, patch
|
||||||
|
|
||||||
|
from langchain_core.documents import Document
|
||||||
|
|
||||||
|
from langchain_community.document_loaders import NotionDBLoader
|
||||||
|
|
||||||
|
|
||||||
|
class TestNotionDBLoader:
    """Unit tests for rich-text handling in ``NotionDBLoader``.

    Tests that go through ``load_page`` or ``load`` patch ``_request`` and
    ``_load_blocks`` on the loader class, so only the property-to-metadata
    conversion is exercised.
    """

    def setup_method(self) -> None:
        """Build a fresh loader with placeholder credentials before each test."""
        self.loader = NotionDBLoader(
            integration_token="fake_token",
            database_id="fake_db_id",
        )

    def test_concatenate_rich_text(self) -> None:
        """All ``plain_text`` fragments are joined in order, without separators."""
        fragments = [{"plain_text": text} for text in ("Hello ", "world", "!")]

        joined = self.loader._concatenate_rich_text(fragments)

        assert joined == "Hello world!"

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load_page_with_rich_text(
        self, mock_load_blocks: Mock, mock_request: Mock
    ) -> None:
        """A multi-element ``rich_text`` property becomes one metadata string."""
        block_text = "Mocked block content"
        mock_load_blocks.return_value = block_text

        description_parts = ("This is ", "a test", " description")
        title_prop = {"type": "title", "title": [{"plain_text": "Test Title"}]}
        description_prop = {
            "type": "rich_text",
            "rich_text": [{"plain_text": part} for part in description_parts],
        }
        page_summary = {
            "id": "page_id",
            "properties": {"Title": title_prop, "Description": description_prop},
        }

        document = self.loader.load_page(page_summary)

        assert document == Document(
            page_content=block_text,
            metadata={
                "title": "Test Title",
                "description": "This is a test description",
                "id": "page_id",
            },
        )

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load_page_with_code_in_rich_text(
        self, mock_load_blocks: Mock, mock_request: Mock
    ) -> None:
        """Text surrounding a code-like fragment is kept in full."""
        block_text = "Mocked block content"
        mock_load_blocks.return_value = block_text

        answer_parts = ("Use ", "print('Hello')", " to display text")
        answer_prop = {
            "type": "rich_text",
            "rich_text": [{"plain_text": part} for part in answer_parts],
        }
        page_summary = {"id": "page_id", "properties": {"Answer": answer_prop}}

        document = self.loader.load_page(page_summary)

        assert document == Document(
            page_content=block_text,
            metadata={"answer": "Use print('Hello') to display text", "id": "page_id"},
        )

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load(self, mock_load_blocks: Mock, mock_request: Mock) -> None:
        """``load`` returns one document per page in the mocked query result."""
        block_text = "Mocked block content"
        mock_load_blocks.return_value = block_text

        # Insertion order is preserved, so pages are listed as page_id_1, page_id_2.
        titles_by_id = {"page_id_1": "Test Title 1", "page_id_2": "Test Title 2"}
        mock_request.return_value = {
            "results": [
                {
                    "id": page_id,
                    "properties": {
                        "Title": {
                            "type": "title",
                            "title": [{"plain_text": title}],
                        }
                    },
                }
                for page_id, title in titles_by_id.items()
            ],
            "has_more": False,
        }

        documents = self.loader.load()

        assert documents == [
            Document(
                page_content=block_text,
                metadata={"title": title, "id": page_id},
            )
            for page_id, title in titles_by_id.items()
        ]
|
Loading…
Reference in New Issue
Block a user