mirror of
https://github.com/hwchase17/langchain.git
synced 2025-06-25 08:03:39 +00:00
community: Correctly handle multi-element rich text (#25762)
**Description:** - Add _concatenate_rich_text method to combine all elements in rich text arrays - Update load_page method to use _concatenate_rich_text for rich text properties - Ensure all text content is captured, including inline code and formatted text - Add unit tests to verify correct handling of multi-element rich text. This fix prevents truncation of content after backticks or other formatting elements. **Issue:** When using the Notion DB Loader, the text of `rich_text` and `title` properties was truncated after the first element, because the loader read only the first element of the array. **Dependencies:** None. --------- Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
parent
b2102b8cc4
commit
8f5e72de05
@ -107,19 +107,15 @@ class NotionDBLoader(BaseLoader):
|
||||
# load properties as metadata
|
||||
metadata: Dict[str, Any] = {}
|
||||
|
||||
value: Any
|
||||
|
||||
for prop_name, prop_data in page_summary["properties"].items():
|
||||
prop_type = prop_data["type"]
|
||||
|
||||
if prop_type == "rich_text":
|
||||
value = (
|
||||
prop_data["rich_text"][0]["plain_text"]
|
||||
if prop_data["rich_text"]
|
||||
else None
|
||||
)
|
||||
value = self._concatenate_rich_text(prop_data["rich_text"])
|
||||
elif prop_type == "title":
|
||||
value = (
|
||||
prop_data["title"][0]["plain_text"] if prop_data["title"] else None
|
||||
)
|
||||
value = self._concatenate_rich_text(prop_data["title"])
|
||||
elif prop_type == "multi_select":
|
||||
value = (
|
||||
[item["name"] for item in prop_data["multi_select"]]
|
||||
@ -228,3 +224,7 @@ class NotionDBLoader(BaseLoader):
|
||||
)
|
||||
res.raise_for_status()
|
||||
return res.json()
|
||||
|
||||
def _concatenate_rich_text(self, rich_text_array: List[Dict[str, Any]]) -> str:
    """Join the ``plain_text`` of every entry in a rich_text array.

    Args:
        rich_text_array: Sequence of dicts, each carrying a ``plain_text`` key.

    Returns:
        The fragments concatenated in order, with no separator; an empty
        array yields an empty string.

    Raises:
        KeyError: If an entry has no ``plain_text`` key.
    """
    fragments = [segment["plain_text"] for segment in rich_text_array]
    return "".join(fragments)
|
||||
|
@ -0,0 +1,138 @@
|
||||
from unittest.mock import Mock, patch
|
||||
|
||||
from langchain_core.documents import Document
|
||||
|
||||
from langchain_community.document_loaders import NotionDBLoader
|
||||
|
||||
|
||||
class TestNotionDBLoader:
    """Tests for NotionDBLoader's handling of title and rich_text properties."""

    def setup_method(self) -> None:
        """Create the loader under test with placeholder credentials."""
        self.loader = NotionDBLoader(
            database_id="fake_db_id", integration_token="fake_token"
        )

    def test_concatenate_rich_text(self) -> None:
        """Three plain-text fragments are joined into one string."""
        fragments = [{"plain_text": piece} for piece in ("Hello ", "world", "!")]

        joined = self.loader._concatenate_rich_text(fragments)

        assert joined == "Hello world!"

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load_page_with_rich_text(
        self, mock_load_blocks: Mock, mock_request: Mock
    ) -> None:
        """A multi-fragment rich_text property is kept whole in the metadata."""
        # Block loading is mocked so only property handling is exercised.
        mock_load_blocks.return_value = "Mocked block content"
        title_prop = {"type": "title", "title": [{"plain_text": "Test Title"}]}
        description_prop = {
            "type": "rich_text",
            "rich_text": [
                {"plain_text": "This is "},
                {"plain_text": "a test"},
                {"plain_text": " description"},
            ],
        }
        summary = {
            "id": "page_id",
            "properties": {"Title": title_prop, "Description": description_prop},
        }

        document = self.loader.load_page(summary)

        assert document == Document(
            page_content="Mocked block content",
            metadata={
                "title": "Test Title",
                "description": "This is a test description",
                "id": "page_id",
            },
        )

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load_page_with_code_in_rich_text(
        self, mock_load_blocks: Mock, mock_request: Mock
    ) -> None:
        """A code-like fragment in the middle of rich_text is not dropped."""
        mock_load_blocks.return_value = "Mocked block content"
        answer_prop = {
            "type": "rich_text",
            "rich_text": [
                {"plain_text": "Use "},
                {"plain_text": "print('Hello')"},
                {"plain_text": " to display text"},
            ],
        }
        summary = {"id": "page_id", "properties": {"Answer": answer_prop}}

        document = self.loader.load_page(summary)

        assert document == Document(
            page_content="Mocked block content",
            metadata={"answer": "Use print('Hello') to display text", "id": "page_id"},
        )

    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._request")
    @patch("langchain_community.document_loaders.notiondb.NotionDBLoader._load_blocks")
    def test_load(self, mock_load_blocks: Mock, mock_request: Mock) -> None:
        """load() returns one Document per page in the mocked query response."""
        mock_load_blocks.return_value = "Mocked block content"
        first_page = {
            "id": "page_id_1",
            "properties": {
                "Title": {
                    "type": "title",
                    "title": [{"plain_text": "Test Title 1"}],
                }
            },
        }
        second_page = {
            "id": "page_id_2",
            "properties": {
                "Title": {
                    "type": "title",
                    "title": [{"plain_text": "Test Title 2"}],
                }
            },
        }
        # A single response with has_more=False, as returned by the mocked _request.
        mock_request.return_value = {
            "results": [first_page, second_page],
            "has_more": False,
        }

        documents = self.loader.load()

        assert documents == [
            Document(
                page_content="Mocked block content",
                metadata={"title": "Test Title 1", "id": "page_id_1"},
            ),
            Document(
                page_content="Mocked block content",
                metadata={"title": "Test Title 2", "id": "page_id_2"},
            ),
        ]
|
Loading…
Reference in New Issue
Block a user