upstage[minor]: add merge_and_split function for document loader (#21603)

- Introduce the `merge_and_split` function in the
`UpstageLayoutAnalysisLoader`.
- The `merge_and_split` function takes a list of documents and a
splitter as inputs.
- When a splitter is provided, the function passes the documents to the
splitter's own `split_documents` method and returns the resulting
documents.
- If the provided splitter is `None` (which is the default setting), the
function will simply merge the documents without splitting them.
This commit is contained in:
junkeon 2024-05-13 23:55:19 +09:00 committed by GitHub
parent 500569da48
commit 480c02bf55
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 45 additions and 1 deletions

View File

@ -1,7 +1,7 @@
import os import os
import warnings import warnings
from pathlib import Path from pathlib import Path
from typing import Iterator, List, Literal, Optional, Union from typing import Any, Dict, Iterator, List, Literal, Optional, Union
from langchain_core.document_loaders import BaseLoader, Blob from langchain_core.document_loaders import BaseLoader, Blob
from langchain_core.documents import Document from langchain_core.documents import Document
@ -204,3 +204,45 @@ class UpstageLayoutAnalysisLoader(BaseLoader):
exclude=self.exclude, exclude=self.exclude,
) )
yield from parser.lazy_parse(blob) yield from parser.lazy_parse(blob)
def merge_and_split(
    self, documents: List[Document], splitter: Optional[object] = None
) -> List[Document]:
    """Merge documents into one, or delegate splitting to a custom splitter.

    Without a splitter, the page contents of all documents are joined with a
    single space, and their metadata is combined into one dict that maps each
    key to the list of values seen for that key, in document order.

    With a splitter, the documents are passed unchanged to
    ``splitter.split_documents`` and its result is returned as-is; no merging
    happens in that case.

    Args:
        documents: The Document objects to merge or split.
        splitter: An optional object that implements a ``split_documents``
            method. Defaults to None, in which case the documents are merged.

    Returns:
        A list of Document objects. If no splitter is provided, the list
        holds a single Document with the merged content and combined
        metadata. Otherwise it is whatever ``splitter.split_documents``
        returns.

    Raises:
        AssertionError: If a splitter is provided but it does not implement
            the ``split_documents`` method.
    """
    if splitter is not None:
        # Raise explicitly rather than via an `assert` statement: asserts are
        # removed under `python -O`, which would silently skip this check.
        if not hasattr(splitter, "split_documents"):
            raise AssertionError(
                "splitter must implement split_documents method"
            )
        return splitter.split_documents(documents)

    merged_content = " ".join(doc.page_content for doc in documents)

    # Every key maps to a list, even when only one document supplies it, so
    # the merged metadata has a uniform shape.
    metadatas: Dict[str, Any] = {}
    for doc in documents:
        for key, value in doc.metadata.items():
            metadatas.setdefault(key, []).append(value)

    return [Document(page_content=merged_content, metadata=metadatas)]

View File

@ -246,6 +246,8 @@ class UpstageLayoutAnalysisParser(BaseBlobParser):
"id": elements["id"], "id": elements["id"],
"type": self.output_type, "type": self.output_type,
"split": self.split, "split": self.split,
"bbox": elements["bounding_box"],
"category": elements["category"],
}, },
) )