From 480c02bf553de894cedc60504b126807dd6dea00 Mon Sep 17 00:00:00 2001 From: junkeon <35945268+junkeon@users.noreply.github.com> Date: Mon, 13 May 2024 23:55:19 +0900 Subject: [PATCH] upstage[minor]: add merge_and_split function for document loader (#21603) - Introduce the `merge_and_split` method in `UpstageLayoutAnalysisLoader`. - The `merge_and_split` method takes a list of documents and an optional splitter as inputs. - If a splitter is provided, the documents are passed unmerged to the splitter's own `split_documents` method and its result is returned; the splitter must implement `split_documents`. - If the splitter is `None` (the default), the method joins the page contents with spaces into a single document and collects the metadata values into per-key lists, without splitting anything. --- .../langchain_upstage/layout_analysis.py | 44 ++++++++++++++++++- .../layout_analysis_parsers.py | 2 + 2 files changed, 45 insertions(+), 1 deletion(-) diff --git a/libs/partners/upstage/langchain_upstage/layout_analysis.py b/libs/partners/upstage/langchain_upstage/layout_analysis.py index 40dd74b3444..fc68f4d1398 100644 --- a/libs/partners/upstage/langchain_upstage/layout_analysis.py +++ b/libs/partners/upstage/langchain_upstage/layout_analysis.py @@ -1,7 +1,7 @@ import os import warnings from pathlib import Path -from typing import Iterator, List, Literal, Optional, Union +from typing import Any, Dict, Iterator, List, Literal, Optional, Union from langchain_core.document_loaders import BaseLoader, Blob from langchain_core.documents import Document @@ -204,3 +204,45 @@ class UpstageLayoutAnalysisLoader(BaseLoader): exclude=self.exclude, ) yield from parser.lazy_parse(blob) + + def merge_and_split( + self, documents: List[Document], splitter: Optional[object] = None + ) -> List[Document]: + """ + Merges the page content and metadata of multiple documents into a single + document, or splits the documents using a custom splitter. + + Args: + documents (list): A list of Document objects to be merged and split. 
+ splitter (object, optional): An optional splitter object that implements the + `split_documents` method. If provided, the documents will be split using + this splitter. Defaults to None, in which case the documents are merged. + + Returns: + list: A list of Document objects. If no splitter is provided, a single + Document object is returned with the merged content and combined metadata. + If a splitter is provided, the documents are split and a list of Document + objects is returned. + + Raises: + AssertionError: If a splitter is provided but it does not implement the + `split_documents` method. + """ + if splitter is None: + merged_content = " ".join([doc.page_content for doc in documents]) + + metadatas: Dict[str, Any] = dict() + for _meta in [doc.metadata for doc in documents]: + for key, value in _meta.items(): + if key in metadatas: + metadatas[key].append(value) + else: + metadatas[key] = [value] + + return [Document(page_content=merged_content, metadata=metadatas)] + else: + assert hasattr( + splitter, "split_documents" + ), "splitter must implement split_documents method" + + return splitter.split_documents(documents) diff --git a/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py b/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py index 7979db12757..5a4056dfe61 100644 --- a/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py +++ b/libs/partners/upstage/langchain_upstage/layout_analysis_parsers.py @@ -246,6 +246,8 @@ class UpstageLayoutAnalysisParser(BaseBlobParser): "id": elements["id"], "type": self.output_type, "split": self.split, + "bbox": elements["bounding_box"], + "category": elements["category"], }, )