community: Correctly handle multi-element rich text (#25762)

**Description:**

- Add _concatenate_rich_text method to combine all elements in rich text
arrays
- Update load_page method to use _concatenate_rich_text for rich text
properties
- Ensure all text content is captured, including inline code and
formatted text
- Add unit tests to verify correct handling of multi-element rich text

This fix prevents truncation of content after backticks or other
formatting elements.

 **Issue:**

When using the Notion DB Loader, the text of `rich_text` and `title`
properties is truncated after the first element, because the loader only
reads the first element of the array.

**Dependencies:** None.

---------

Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Erick Friis <erick@langchain.dev>
This commit is contained in:
Hiros
2024-12-16 12:20:27 -08:00
committed by GitHub
parent b2102b8cc4
commit 8f5e72de05
2 changed files with 146 additions and 8 deletions

View File

@@ -107,19 +107,15 @@ class NotionDBLoader(BaseLoader):
# load properties as metadata
metadata: Dict[str, Any] = {}
value: Any
for prop_name, prop_data in page_summary["properties"].items():
prop_type = prop_data["type"]
if prop_type == "rich_text":
value = (
prop_data["rich_text"][0]["plain_text"]
if prop_data["rich_text"]
else None
)
value = self._concatenate_rich_text(prop_data["rich_text"])
elif prop_type == "title":
value = (
prop_data["title"][0]["plain_text"] if prop_data["title"] else None
)
value = self._concatenate_rich_text(prop_data["title"])
elif prop_type == "multi_select":
value = (
[item["name"] for item in prop_data["multi_select"]]
@@ -228,3 +224,7 @@ class NotionDBLoader(BaseLoader):
)
res.raise_for_status()
return res.json()
def _concatenate_rich_text(self, rich_text_array: List[Dict[str, Any]]) -> str:
"""Concatenate all text content from a rich_text array."""
return "".join(item["plain_text"] for item in rich_text_array)