upstage[minor]: add merge_and_split function for document loader (#21603)

- Introduce the `merge_and_split` method on `UpstageLayoutAnalysisLoader`. - `merge_and_split` takes a list of documents and an optional splitter as inputs. - When a splitter is provided, the documents are divided by calling the splitter's own `split_documents` method. - If the provided splitter is `None` (the default), the documents are simply merged into a single document without being split.
2025-07-21 12:01:47 +00:00 · 2024-05-13 23:55:19 +09:00 · 2024-05-13 23:55:19 +09:00 · 480c02bf55
commit 480c02bf55
parent 500569da48
2 changed files with 45 additions and 1 deletions
--- a/libs/partners/upstage/langchain_upstage/layout_analysis.py
+++ b/libs/partners/upstage/langchain_upstage/layout_analysis.py
@ -1,7 +1,7 @@
 import os
 import warnings
 from pathlib import Path
-from typing import Iterator, List, Literal, Optional, Union
+from typing import Any, Dict, Iterator, List, Literal, Optional, Union
 from langchain_core.document_loaders import BaseLoader, Blob
 from langchain_core.documents import Document
@ -204,3 +204,45 @@ class UpstageLayoutAnalysisLoader(BaseLoader):
                exclude=self.exclude,
            )
            yield from parser.lazy_parse(blob)
    def merge_and_split(
        self, documents: List[Document], splitter: Optional[object] = None
    ) -> List[Document]:
        """
        Merge documents into a single document, or split them with a splitter.

        Args:
            documents (list): A list of Document objects to be merged or split.
            splitter (object, optional): An object that implements a
                `split_documents` method. If provided, `documents` is passed to
                that method unchanged (no merging happens first) and its result
                is returned. Defaults to None, in which case the documents are
                merged.

        Returns:
            list: A list of Document objects. Without a splitter, the list holds
            exactly one Document whose `page_content` is every input's
            `page_content` joined by a single space, and whose metadata maps each
            key seen in any input to the list of values found for that key, in
            input order. With a splitter, the return value of
            `splitter.split_documents(documents)`.

        Raises:
            AssertionError: If a splitter is provided but it does not implement the
            `split_documents` method.
        """
        if splitter is not None:
            # Raise explicitly rather than via an `assert` statement: asserts are
            # removed when Python runs with -O, which would silently drop this
            # check and surface a less helpful AttributeError instead.
            if not hasattr(splitter, "split_documents"):
                raise AssertionError("splitter must implement split_documents method")
            return splitter.split_documents(documents)

        merged_content = " ".join(doc.page_content for doc in documents)

        # Collect every value per metadata key so nothing is lost when several
        # documents share a key; a key seen once still maps to a 1-item list.
        metadatas: Dict[str, Any] = {}
        for doc in documents:
            for key, value in doc.metadata.items():
                metadatas.setdefault(key, []).append(value)

        return [Document(page_content=merged_content, metadata=metadatas)]
--- a/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
+++ b/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py
@ -246,6 +246,8 @@ class UpstageLayoutAnalysisParser(BaseBlobParser):
                "id": elements["id"],
                "type": self.output_type,
                "split": self.split,
                "bbox": elements["bounding_box"],
                "category": elements["category"],
            },
        )