From 480c02bf553de894cedc60504b126807dd6dea00 Mon Sep 17 00:00:00 2001
From: junkeon <35945268+junkeon@users.noreply.github.com>
Date: Mon, 13 May 2024 23:55:19 +0900
Subject: [PATCH] upstage[minor]: add merge_and_split function for document
 loader (#21603)

- Introduce the `merge_and_split` function in the
`UpstageLayoutAnalysisLoader`.
- The `merge_and_split` function takes a list of documents and a
splitter as inputs.
- When a splitter is provided, the function passes the documents (unmerged)
to the splitter's own `split_documents` method and returns the resulting
list of documents.
- If the provided splitter is `None` (which is the default setting), the
function will simply merge the documents without splitting them.
---
 .../langchain_upstage/layout_analysis.py      | 44 ++++++++++++++++++-
 .../layout_analysis_parsers.py                |  2 +
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/libs/partners/upstage/langchain_upstage/layout_analysis.py b/libs/partners/upstage/langchain_upstage/layout_analysis.py
index 40dd74b3444..fc68f4d1398 100644
--- a/libs/partners/upstage/langchain_upstage/layout_analysis.py
+++ b/libs/partners/upstage/langchain_upstage/layout_analysis.py
@@ -1,7 +1,7 @@
 import os
 import warnings
 from pathlib import Path
-from typing import Iterator, List, Literal, Optional, Union
+from typing import Any, Dict, Iterator, List, Literal, Optional, Union
 
 from langchain_core.document_loaders import BaseLoader, Blob
 from langchain_core.documents import Document
@@ -204,3 +204,45 @@ class UpstageLayoutAnalysisLoader(BaseLoader):
                 exclude=self.exclude,
             )
             yield from parser.lazy_parse(blob)
+
+    def merge_and_split(
+        self, documents: List[Document], splitter: Optional[object] = None
+    ) -> List[Document]:
+        """
+        Merge documents into one, or split them with a custom splitter.
+
+        Without a splitter, page contents are joined with single spaces and every
+        metadata key maps to the list of its values across the documents, in input
+        order. With a splitter, the documents are handed to it unmerged.
+
+        Args:
+            documents (list): The Document objects to merge or split.
+            splitter (object, optional): An object that implements the
+                `split_documents` method. If provided, its result for `documents`
+                is returned as-is. Defaults to None, in which case the documents
+                are merged.
+
+        Returns:
+            list: A single-element list holding the merged Document when no
+            splitter is given; otherwise whatever `splitter.split_documents`
+            returns.
+
+        Raises:
+            AssertionError: If a splitter is provided but it does not implement the
+            `split_documents` method.
+        """
+        if splitter is None:
+            merged_content = " ".join(doc.page_content for doc in documents)
+
+            metadatas: Dict[str, Any] = {}
+            for doc in documents:
+                for key, value in doc.metadata.items():
+                    metadatas.setdefault(key, []).append(value)
+
+            return [Document(page_content=merged_content, metadata=metadatas)]
+
+        # Raise explicitly: a bare `assert` is stripped under `python -O`, which
+        # would let an invalid splitter fail later with a less helpful error.
+        if not hasattr(splitter, "split_documents"):
+            raise AssertionError("splitter must implement split_documents method")
+        return splitter.split_documents(documents)
diff --git a/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py b/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
index 7979db12757..5a4056dfe61 100644
--- a/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
+++ b/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
@@ -246,6 +246,8 @@ class UpstageLayoutAnalysisParser(BaseBlobParser):
                 "id": elements["id"],
                 "type": self.output_type,
                 "split": self.split,
+                "bbox": elements["bounding_box"],
+                "category": elements["category"],
             },
         )